Diffstat (limited to 'src/third_party/wiredtiger/src')
-rw-r--r--  src/third_party/wiredtiger/src/async/async_api.c  604
-rw-r--r--  src/third_party/wiredtiger/src/async/async_op.c  359
-rw-r--r--  src/third_party/wiredtiger/src/async/async_worker.c  359
-rw-r--r--  src/third_party/wiredtiger/src/block/block_addr.c  202
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ckpt.c  842
-rw-r--r--  src/third_party/wiredtiger/src/block/block_compact.c  221
-rw-r--r--  src/third_party/wiredtiger/src/block/block_ext.c  1437
-rw-r--r--  src/third_party/wiredtiger/src/block/block_map.c  65
-rw-r--r--  src/third_party/wiredtiger/src/block/block_mgr.c  433
-rw-r--r--  src/third_party/wiredtiger/src/block/block_open.c  330
-rw-r--r--  src/third_party/wiredtiger/src/block/block_read.c  212
-rw-r--r--  src/third_party/wiredtiger/src/block/block_session.c  305
-rw-r--r--  src/third_party/wiredtiger/src/block/block_slvg.c  190
-rw-r--r--  src/third_party/wiredtiger/src/block/block_vrfy.c  514
-rw-r--r--  src/third_party/wiredtiger/src/block/block_write.c  269
-rw-r--r--  src/third_party/wiredtiger/src/bloom/bloom.c  351
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_compact.c  215
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curnext.c  468
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curprev.c  560
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_cursor.c  1025
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_debug.c  1104
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_delete.c  339
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_discard.c  422
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_evict.c  1297
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_handle.c  770
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_huffman.c  340
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_io.c  304
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_misc.c  128
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ovfl.c  270
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_page.c  734
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_read.c  88
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ret.c  116
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_slvg.c  2520
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_stat.c  190
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_sync.c  373
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_upgrade.c  22
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy.c  666
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c  739
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_walk.c  285
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_modify.c  223
-rw-r--r--  src/third_party/wiredtiger/src/btree/col_srch.c  199
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_evict.c  468
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_split.c  1121
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_track.c  904
-rw-r--r--  src/third_party/wiredtiger/src/btree/rec_write.c  5521
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_key.c  500
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_modify.c  346
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_srch.c  553
-rw-r--r--  src/third_party/wiredtiger/src/config/config.c  745
-rw-r--r--  src/third_party/wiredtiger/src/config/config_api.c  105
-rw-r--r--  src/third_party/wiredtiger/src/config/config_check.c  370
-rw-r--r--  src/third_party/wiredtiger/src/config/config_collapse.c  380
-rw-r--r--  src/third_party/wiredtiger/src/config/config_concat.c  71
-rw-r--r--  src/third_party/wiredtiger/src/config/config_def.c  744
-rw-r--r--  src/third_party/wiredtiger/src/config/config_ext.c  44
-rw-r--r--  src/third_party/wiredtiger/src/config/config_upgrade.c  32
-rw-r--r--  src/third_party/wiredtiger/src/conn/api_strerror.c  43
-rw-r--r--  src/third_party/wiredtiger/src/conn/api_version.c  24
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_api.c  1573
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_cache.c  174
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_cache_pool.c  639
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_ckpt.c  228
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_dhandle.c  694
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_handle.c  142
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_log.c  284
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_open.c  244
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_stat.c  540
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_sweep.c  187
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_backup.c  540
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_bulk.c  287
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_config.c  65
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_ds.c  524
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_dump.c  400
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_file.c  471
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_index.c  447
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_json.c  931
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_log.c  380
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_metadata.c  444
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_stat.c  574
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_std.c  625
-rw-r--r--  src/third_party/wiredtiger/src/cursor/cur_table.c  808
-rw-r--r--  src/third_party/wiredtiger/src/include/api.h  128
-rw-r--r--  src/third_party/wiredtiger/src/include/async.h  128
-rw-r--r--  src/third_party/wiredtiger/src/include/bitstring.i  316
-rw-r--r--  src/third_party/wiredtiger/src/include/block.h  337
-rw-r--r--  src/third_party/wiredtiger/src/include/bloom.h  28
-rw-r--r--  src/third_party/wiredtiger/src/include/btmem.h  1015
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.h  155
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.i  1216
-rw-r--r--  src/third_party/wiredtiger/src/include/buf.i  133
-rw-r--r--  src/third_party/wiredtiger/src/include/cache.h  139
-rw-r--r--  src/third_party/wiredtiger/src/include/cache.i  174
-rw-r--r--  src/third_party/wiredtiger/src/include/cell.i  816
-rw-r--r--  src/third_party/wiredtiger/src/include/column.i  201
-rw-r--r--  src/third_party/wiredtiger/src/include/compact.h  12
-rw-r--r--  src/third_party/wiredtiger/src/include/config.h  85
-rw-r--r--  src/third_party/wiredtiger/src/include/connection.h  270
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor.h  380
-rw-r--r--  src/third_party/wiredtiger/src/include/cursor.i  277
-rw-r--r--  src/third_party/wiredtiger/src/include/dhandle.h  73
-rw-r--r--  src/third_party/wiredtiger/src/include/dlh.h  15
-rw-r--r--  src/third_party/wiredtiger/src/include/error.h  141
-rw-r--r--  src/third_party/wiredtiger/src/include/extern.h  650
-rw-r--r--  src/third_party/wiredtiger/src/include/flags.h  88
-rw-r--r--  src/third_party/wiredtiger/src/include/gcc.h  152
-rw-r--r--  src/third_party/wiredtiger/src/include/hardware.h  60
-rw-r--r--  src/third_party/wiredtiger/src/include/intpack.i  371
-rw-r--r--  src/third_party/wiredtiger/src/include/lint.h  56
-rw-r--r--  src/third_party/wiredtiger/src/include/log.h  177
-rw-r--r--  src/third_party/wiredtiger/src/include/lsm.h  232
-rw-r--r--  src/third_party/wiredtiger/src/include/meta.h  58
-rw-r--r--  src/third_party/wiredtiger/src/include/misc.h  221
-rw-r--r--  src/third_party/wiredtiger/src/include/misc.i  32
-rw-r--r--  src/third_party/wiredtiger/src/include/msvc.h  70
-rw-r--r--  src/third_party/wiredtiger/src/include/mutex.h  73
-rw-r--r--  src/third_party/wiredtiger/src/include/mutex.i  368
-rw-r--r--  src/third_party/wiredtiger/src/include/os.h  72
-rw-r--r--  src/third_party/wiredtiger/src/include/os_windows.h  60
-rw-r--r--  src/third_party/wiredtiger/src/include/packing.i  685
-rw-r--r--  src/third_party/wiredtiger/src/include/posix.h  47
-rw-r--r--  src/third_party/wiredtiger/src/include/queue.h  559
-rw-r--r--  src/third_party/wiredtiger/src/include/schema.h  101
-rw-r--r--  src/third_party/wiredtiger/src/include/serial.i  329
-rw-r--r--  src/third_party/wiredtiger/src/include/session.h  156
-rw-r--r--  src/third_party/wiredtiger/src/include/stat.h  332
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.h  139
-rw-r--r--  src/third_party/wiredtiger/src/include/txn.i  382
-rw-r--r--  src/third_party/wiredtiger/src/include/verify_build.h  75
-rw-r--r--  src/third_party/wiredtiger/src/include/wiredtiger.in  3463
-rw-r--r--  src/third_party/wiredtiger/src/include/wiredtiger_ext.h  398
-rw-r--r--  src/third_party/wiredtiger/src/include/wt_internal.h  337
-rw-r--r--  src/third_party/wiredtiger/src/log/log.c  1243
-rw-r--r--  src/third_party/wiredtiger/src/log/log_auto.c  437
-rw-r--r--  src/third_party/wiredtiger/src/log/log_slot.c  354
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_cursor.c  1519
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_manager.c  667
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_merge.c  489
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_meta.c  238
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_stat.c  162
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_tree.c  1266
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_work_unit.c  625
-rw-r--r--  src/third_party/wiredtiger/src/lsm/lsm_worker.c  167
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_apply.c  62
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ckpt.c  528
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_ext.c  103
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_table.c  206
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_track.c  365
-rw-r--r--  src/third_party/wiredtiger/src/meta/meta_turtle.c  318
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_abort.c  26
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_alloc.c  238
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_dir.c  94
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_dlopen.c  83
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_errno.c  22
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_exist.c  37
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fallocate.c  97
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_filesize.c  55
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_flock.c  37
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_fsync.c  54
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_ftruncate.c  26
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_getline.c  48
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_getopt.c  150
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_map.c  136
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c  157
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c  227
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_once.c  20
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_open.c  253
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_path.c  28
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_priv.c  19
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_remove.c  66
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_rename.c  38
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_rw.c  86
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_sleep.c  23
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_strtouq.c  24
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_thread.c  59
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_time.c  53
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_yield.c  18
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_dir.c  111
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_dlopen.c  86
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_errno.c  27
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_exist.c  32
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_fallocate.c  53
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_filesize.c  56
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_flock.c  46
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_fsync.c  40
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_ftruncate.c  40
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_map.c  106
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_mtx_cond.c  155
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_mtx_rw.c  126
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_once.c  39
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_open.c  219
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_path.c  34
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_priv.c  19
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_remove.c  68
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_rename.c  51
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_rw.c  98
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_sleep.c  18
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_thread.c  51
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_time.c  62
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_vsnprintf.c  31
-rw-r--r--  src/third_party/wiredtiger/src/os_win/os_yield.c  18
-rw-r--r--  src/third_party/wiredtiger/src/packing/pack_api.c  137
-rw-r--r--  src/third_party/wiredtiger/src/packing/pack_impl.c  96
-rw-r--r--  src/third_party/wiredtiger/src/packing/pack_stream.c  296
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_create.c  595
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_drop.c  204
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_list.c  204
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_open.c  510
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_plan.c  394
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_project.c  474
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_rename.c  276
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_stat.c  114
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_truncate.c  183
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_util.c  84
-rw-r--r--  src/third_party/wiredtiger/src/schema/schema_worker.c  134
-rw-r--r--  src/third_party/wiredtiger/src/session/session_api.c  1054
-rw-r--r--  src/third_party/wiredtiger/src/session/session_compact.c  236
-rw-r--r--  src/third_party/wiredtiger/src/session/session_dhandle.c  478
-rw-r--r--  src/third_party/wiredtiger/src/session/session_salvage.c  58
-rw-r--r--  src/third_party/wiredtiger/src/support/cksum.c  1306
-rw-r--r--  src/third_party/wiredtiger/src/support/err.c  527
-rw-r--r--  src/third_party/wiredtiger/src/support/filename.c  49
-rw-r--r--  src/third_party/wiredtiger/src/support/global.c  118
-rw-r--r--  src/third_party/wiredtiger/src/support/hash_city.c  323
-rw-r--r--  src/third_party/wiredtiger/src/support/hash_fnv.c  161
-rw-r--r--  src/third_party/wiredtiger/src/support/hazard.c  244
-rw-r--r--  src/third_party/wiredtiger/src/support/hex.c  215
-rw-r--r--  src/third_party/wiredtiger/src/support/huffman.c  899
-rw-r--r--  src/third_party/wiredtiger/src/support/mutex.c  257
-rw-r--r--  src/third_party/wiredtiger/src/support/pow.c  130
-rw-r--r--  src/third_party/wiredtiger/src/support/rand.c  69
-rw-r--r--  src/third_party/wiredtiger/src/support/scratch.c  319
-rw-r--r--  src/third_party/wiredtiger/src/support/stat.c  567
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn.c  554
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ckpt.c  944
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_ext.c  104
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_log.c  500
-rw-r--r--  src/third_party/wiredtiger/src/txn/txn_recover.c  491
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util.h  50
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_backup.c  205
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_compact.c  59
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_cpyright.c  35
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_create.c  53
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_drop.c  50
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_dump.c  701
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_list.c  193
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load.c  595
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load.h  27
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_load_json.c  573
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_loadtext.c  157
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_main.c  262
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_misc.c  146
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_printlog.c  65
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_read.c  101
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_rename.c  60
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_salvage.c  68
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_stat.c  103
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_upgrade.c  63
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_verbose.c  62
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_verify.c  119
-rw-r--r--  src/third_party/wiredtiger/src/utilities/util_write.c  107
260 files changed, 88711 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/async/async_api.c b/src/third_party/wiredtiger/src/async/async_api.c
new file mode 100644
index 00000000000..3cb78e80b09
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_api.c
@@ -0,0 +1,604 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_get_format --
+ * Find or allocate the uri/config/format structure.
+ */
+static int
+__async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
+ const char *config, WT_ASYNC_OP_IMPL *op)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_FORMAT *af;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+ uint64_t cfg_hash, uri_hash;
+
+ async = conn->async;
+ c = NULL;
+ op->format = NULL;
+
+ if (uri != NULL)
+ uri_hash = __wt_hash_city64(uri, strlen(uri));
+ else
+ uri_hash = 0;
+ if (config != NULL)
+ cfg_hash = __wt_hash_city64(config, strlen(config));
+ else
+ cfg_hash = 0;
+
+ /*
+ * We don't need to hold a lock around this walk. The list is
+ * permanent and always valid. We may race with an insert, in which
+ * case a duplicate entry can end up on the list, but that is
+ * harmless.
+ */
+ STAILQ_FOREACH(af, &async->formatqh, q) {
+ if (af->uri_hash == uri_hash && af->cfg_hash == cfg_hash)
+ goto setup;
+ }
+ /*
+ * We didn't find one in the cache. Allocate and initialize one.
+ * Insert it at the head expecting LRU usage. We need a real session
+ * for the cursor.
+ */
+ WT_RET(
+ __wt_open_internal_session(conn, "async-cursor", 1, 1, &session));
+ __wt_spin_lock(session, &async->ops_lock);
+ WT_ERR(__wt_calloc_def(session, 1, &af));
+ WT_ERR(__wt_strdup(session, uri, &af->uri));
+ WT_ERR(__wt_strdup(session, config, &af->config));
+ af->uri_hash = uri_hash;
+ af->cfg_hash = cfg_hash;
+ /*
+ * Get the key_format and value_format for this URI and store
+ * it in the structure so that async->set_key/value work.
+ */
+ wt_session = &session->iface;
+ WT_ERR(wt_session->open_cursor(wt_session, uri, NULL, NULL, &c));
+ WT_ERR(__wt_strdup(session, c->key_format, &af->key_format));
+ WT_ERR(__wt_strdup(session, c->value_format, &af->value_format));
+ WT_ERR(c->close(c));
+ c = NULL;
+
+ STAILQ_INSERT_HEAD(&async->formatqh, af, q);
+ __wt_spin_unlock(session, &async->ops_lock);
+ WT_ERR(wt_session->close(wt_session, NULL));
+
+setup: op->format = af;
+ /*
+ * Copy the pointers for the formats. Items in the async format
+ * queue remain there until the connection is closed. We must
+ * initialize the format fields in the async_op, which are publicly
+ * visible, and its internal cursor used by internal key/value
+ * functions.
+ */
+ op->iface.c.key_format = op->iface.key_format = af->key_format;
+ op->iface.c.value_format = op->iface.value_format = af->value_format;
+ return (0);
+
+err:
+ if (c != NULL)
+ (void)c->close(c);
+ __wt_free(session, af->uri);
+ __wt_free(session, af->config);
+ __wt_free(session, af->key_format);
+ __wt_free(session, af->value_format);
+ __wt_free(session, af);
+ return (ret);
+}
+
+/*
+ * __async_new_op_alloc --
+ * Find and allocate the next available async op handle.
+ */
+static int
+__async_new_op_alloc(WT_SESSION_IMPL *session, const char *uri,
+ const char *config, WT_ASYNC_OP_IMPL **opp)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t i, save_i, view;
+
+ conn = S2C(session);
+ async = conn->async;
+ WT_STAT_FAST_CONN_INCR(session, async_op_alloc);
+ *opp = NULL;
+
+retry:
+ op = NULL;
+ WT_ORDERED_READ(save_i, async->ops_index);
+ /*
+ * Search for a free op starting after the last one allocated. We'd
+ * expect ops to be freed mostly FIFO so we should quickly find one.
+ */
+ for (view = 1, i = save_i; i < conn->async_size; i++, view++) {
+ op = &async->async_ops[i];
+ if (op->state == WT_ASYNCOP_FREE)
+ break;
+ }
+
+ /*
+ * Loop around back to the beginning if we need to.
+ */
+ if (op == NULL || op->state != WT_ASYNCOP_FREE)
+ for (i = 0; i < save_i; i++, view++) {
+ op = &async->async_ops[i];
+ if (op->state == WT_ASYNCOP_FREE)
+ break;
+ }
+
+ /*
+ * We still haven't found one. Return an error.
+ */
+ if (op == NULL || op->state != WT_ASYNCOP_FREE) {
+ WT_STAT_FAST_CONN_INCR(session, async_full);
+ WT_RET(EBUSY);
+ }
+ /*
+ * Set the state of this op handle as READY for the user to use.
+ * If we can set the state then the op entry is ours.
+ * Start the next search at the next entry after this one.
+ */
+ if (!WT_ATOMIC_CAS4(op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) {
+ WT_STAT_FAST_CONN_INCR(session, async_alloc_race);
+ goto retry;
+ }
+ WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view);
+ WT_RET(__async_get_format(conn, uri, config, op));
+ op->unique_id = WT_ATOMIC_ADD8(async->op_id, 1);
+ op->optype = WT_AOP_NONE;
+ (void)WT_ATOMIC_STORE4(async->ops_index, (i + 1) % conn->async_size);
+ *opp = op;
+ return (0);
+}
+
+/*
+ * __async_config --
+ * Parse and setup the async API options.
+ */
+static int
+__async_config(WT_SESSION_IMPL *session,
+ WT_CONNECTION_IMPL *conn, const char **cfg, int *runp)
+{
+ WT_CONFIG_ITEM cval;
+
+ /*
+ * The async configuration is off by default.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "async.enabled", &cval));
+ *runp = cval.val != 0;
+
+ /*
+ * Even if async is turned off, we want to parse and store the
+ * default values so that reconfigure can just enable them.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "async.ops_max", &cval));
+ conn->async_size = (uint32_t)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "async.threads", &cval));
+ conn->async_workers = (uint32_t)cval.val;
+ /* Sanity check that api_data.py is in sync with async.h */
+ WT_ASSERT(session, conn->async_workers <= WT_ASYNC_MAX_WORKERS);
+
+ return (0);
+}
+
+/*
+ * __wt_async_stats_update --
+ * Update the async stats for return to the application.
+ */
+void
+__wt_async_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
+
+ conn = S2C(session);
+ async = conn->async;
+ if (async == NULL)
+ return;
+ stats = &conn->stats;
+ WT_STAT_SET(stats, async_cur_queue, async->cur_queue);
+ WT_STAT_SET(stats, async_max_queue, async->max_queue);
+ F_SET(conn, WT_CONN_SERVER_ASYNC);
+}
+
+/*
+ * __async_start --
+ * Start the async subsystem. All configuration processing has
+ * already been done by the caller.
+ */
+static int
+__async_start(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t i;
+
+ conn = S2C(session);
+ conn->async_cfg = 1;
+ /*
+ * Async is on, allocate the WT_ASYNC structure and initialize the ops.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_ASYNC), &conn->async));
+ async = conn->async;
+ STAILQ_INIT(&async->formatqh);
+ WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
+ WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond));
+ WT_RET(__wt_async_op_init(session));
+
+ /*
+ * Start up the worker threads.
+ */
+ F_SET(conn, WT_CONN_SERVER_ASYNC);
+ for (i = 0; i < conn->async_workers; i++) {
+ /*
+ * Each worker has its own session. We set both a general
+ * server flag in the connection and an individual flag
+ * in the session. The user may reconfigure the number of
+ * workers and we may want to selectively stop some workers
+ * while leaving the rest running.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "async-worker", 1, 1, &async->worker_sessions[i]));
+ F_SET(async->worker_sessions[i], WT_SESSION_SERVER_ASYNC);
+ }
+ for (i = 0; i < conn->async_workers; i++) {
+ /*
+ * Start the threads.
+ */
+ WT_RET(__wt_thread_create(session, &async->worker_tids[i],
+ __wt_async_worker, async->worker_sessions[i]));
+ }
+ __wt_async_stats_update(session);
+ return (0);
+}
+
+/*
+ * __wt_async_create --
+ * Start the async subsystem and worker threads.
+ */
+int
+__wt_async_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ int run;
+
+ conn = S2C(session);
+
+ /* Handle configuration. */
+ run = 0;
+ WT_RET(__async_config(session, conn, cfg, &run));
+
+ /* If async is not configured, we're done. */
+ if (!run)
+ return (0);
+ return (__async_start(session));
+}
+
+/*
+ * __wt_async_reconfig --
+ * Reconfigure the async subsystem and worker threads.
+ */
+int
+__wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn, tmp_conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int run;
+ uint32_t i;
+
+ conn = S2C(session);
+ async = conn->async;
+ memset(&tmp_conn, 0, sizeof(tmp_conn));
+ tmp_conn.async_cfg = conn->async_cfg;
+ tmp_conn.async_workers = conn->async_workers;
+ tmp_conn.async_size = conn->async_size;
+
+ /* Handle configuration. */
+ run = conn->async_cfg;
+ WT_RET(__async_config(session, &tmp_conn, cfg, &run));
+
+ /*
+ * There are some restrictions on the live reconfiguration of async.
+ * Unlike other subsystems where we simply destroy anything existing
+ * and restart with the new configuration, async is not so easy.
+ * If the user is just changing the number of workers, we want the
+ * existing op handles and other state to remain valid, so we must
+ * handle the various combinations of changes individually.
+ *
+ * One restriction is that if async is currently on, the user cannot
+ * change the number of async op handles available. The user can try
+ * but we do nothing with it. However we must allow the ops_max config
+ * string so that a user can completely start async via reconfigure.
+ */
+
+ /*
+ * Easy cases:
+ * 1. If async is on and the user wants it off, shut it down.
+ * 2. If async is off, and the user wants it on, start it.
+ * 3. If not a toggle and async is off, we're done.
+ */
+ if (conn->async_cfg > 0 && !run) {
+ /* Case 1 */
+ WT_TRET(__wt_async_flush(session));
+ ret = __wt_async_destroy(session);
+ conn->async_cfg = 0;
+ return (ret);
+ } else if (conn->async_cfg == 0 && run)
+ /* Case 2 */
+ return (__async_start(session));
+ else if (conn->async_cfg == 0)
+ /* Case 3 */
+ return (0);
+
+ /*
+ * Running async worker modification cases:
+ * 4. If number of workers didn't change, we're done.
+ * 5. If more workers, start new ones.
+ * 6. If fewer workers, kill some.
+ */
+ if (conn->async_workers == tmp_conn.async_workers)
+ /* No change in the number of workers. */
+ return (0);
+ if (conn->async_workers < tmp_conn.async_workers) {
+ /* Case 5 */
+ /*
+ * The worker_sessions array is allocated for the maximum
+ * allowed number of workers, so starting more is easy.
+ */
+ for (i = conn->async_workers; i < tmp_conn.async_workers; i++) {
+ /*
+ * Each worker has its own session.
+ */
+ WT_RET(__wt_open_internal_session(conn,
+ "async-worker", 1, 1, &async->worker_sessions[i]));
+ F_SET(async->worker_sessions[i],
+ WT_SESSION_SERVER_ASYNC);
+ }
+ for (i = conn->async_workers; i < tmp_conn.async_workers; i++) {
+ /*
+ * Start the threads.
+ */
+ WT_RET(__wt_thread_create(session,
+ &async->worker_tids[i], __wt_async_worker,
+ async->worker_sessions[i]));
+ }
+ conn->async_workers = tmp_conn.async_workers;
+ }
+ if (conn->async_workers > tmp_conn.async_workers) {
+ /* Case 6 */
+ /*
+ * Stopping an individual async worker is the most complex case.
+ * We clear the session async flag on the targeted worker thread
+ * so that only that thread stops, and the others keep running.
+ */
+ for (i = conn->async_workers - 1;
+ i >= tmp_conn.async_workers; i--) {
+ /*
+ * Join any worker we're stopping.
+ * After the thread is stopped, close its session.
+ */
+ WT_ASSERT(session, async->worker_tids[i] != 0);
+ WT_ASSERT(session, async->worker_sessions[i] != NULL);
+ F_CLR(async->worker_sessions[i],
+ WT_SESSION_SERVER_ASYNC);
+ WT_TRET(__wt_thread_join(
+ session, async->worker_tids[i]));
+ async->worker_tids[i] = 0;
+ wt_session = &async->worker_sessions[i]->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ async->worker_sessions[i] = NULL;
+ }
+ conn->async_workers = tmp_conn.async_workers;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_async_destroy --
+ * Destroy the async worker threads and async subsystem.
+ */
+int
+__wt_async_destroy(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_FORMAT *af, *afnext;
+ WT_ASYNC_OP *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ uint32_t i;
+
+ conn = S2C(session);
+ async = conn->async;
+
+ if (!conn->async_cfg)
+ return (0);
+
+ F_CLR(conn, WT_CONN_SERVER_ASYNC);
+ for (i = 0; i < conn->async_workers; i++)
+ if (async->worker_tids[i] != 0) {
+ WT_TRET(__wt_thread_join(
+ session, async->worker_tids[i]));
+ async->worker_tids[i] = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &async->flush_cond));
+
+ /* Close the server threads' sessions. */
+ for (i = 0; i < conn->async_workers; i++)
+ if (async->worker_sessions[i] != NULL) {
+ wt_session = &async->worker_sessions[i]->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ async->worker_sessions[i] = NULL;
+ }
+ /* Free any op key/value buffers. */
+ for (i = 0; i < conn->async_size; i++) {
+ op = (WT_ASYNC_OP *)&async->async_ops[i];
+ if (op->c.key.data != NULL)
+ __wt_buf_free(session, &op->c.key);
+ if (op->c.value.data != NULL)
+ __wt_buf_free(session, &op->c.value);
+ }
+
+ /* Free format resources */
+ af = STAILQ_FIRST(&async->formatqh);
+ while (af != NULL) {
+ afnext = STAILQ_NEXT(af, q);
+ __wt_free(session, af->uri);
+ __wt_free(session, af->config);
+ __wt_free(session, af->key_format);
+ __wt_free(session, af->value_format);
+ __wt_free(session, af);
+ af = afnext;
+ }
+ __wt_free(session, async->async_queue);
+ __wt_free(session, async->async_ops);
+ __wt_spin_destroy(session, &async->ops_lock);
+ __wt_free(session, conn->async);
+
+ return (ret);
+}
+
+/*
+ * __wt_async_flush --
+ * Implementation of the WT_CONN->async_flush method.
+ */
+int
+__wt_async_flush(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ if (!conn->async_cfg)
+ return (0);
+
+ async = conn->async;
+ WT_STAT_FAST_CONN_INCR(session, async_flush);
+ /*
+ * We have to do several things. First, prevent other callers from
+ * racing with us so that only one flush is happening at a time.
+ * Next, wait for the worker threads to notice the flush and
+ * indicate that the flush is complete on their side. Then we
+ * clear the flush flags and return.
+ */
+retry:
+ while (async->flush_state != WT_ASYNC_FLUSH_NONE)
+ /*
+ * We're racing an in-progress flush. We need to wait
+ * our turn to start our own. We need to convoy the
+ * racing calls because a later call may be waiting for
+ * specific enqueued ops to be complete before this returns.
+ */
+ __wt_sleep(0, 100000);
+
+ if (!WT_ATOMIC_CAS4(async->flush_state, WT_ASYNC_FLUSH_NONE,
+ WT_ASYNC_FLUSH_IN_PROGRESS))
+ goto retry;
+ /*
+ * We're the owner of this flush operation. Set the
+ * WT_ASYNC_FLUSH_IN_PROGRESS to block other callers.
+ * We're also preventing all worker threads from taking
+ * things off the work queue with the lock.
+ */
+ async->flush_count = 0;
+ (void)WT_ATOMIC_ADD8(async->flush_gen, 1);
+ WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE);
+ async->flush_op.state = WT_ASYNCOP_READY;
+ WT_ERR(__wt_async_op_enqueue(session, &async->flush_op));
+ while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE)
+ WT_ERR(__wt_cond_wait(NULL, async->flush_cond, 100000));
+ /*
+ * Flush is done. Clear the flags.
+ */
+ async->flush_op.state = WT_ASYNCOP_FREE;
+ WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSH_NONE);
+err:
+ return (ret);
+}
+
+/*
+ * __async_runtime_config --
+ * Configure runtime fields at allocation.
+ */
+static int
+__async_runtime_config(WT_ASYNC_OP_IMPL *op, const char *cfg[])
+{
+ WT_ASYNC_OP *asyncop;
+ WT_CONFIG_ITEM cval;
+ WT_SESSION_IMPL *session;
+
+ session = O2S(op);
+ asyncop = (WT_ASYNC_OP *)op;
+ WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &cval));
+ if (cval.val)
+ F_SET(&asyncop->c, WT_CURSTD_APPEND);
+ else
+ F_CLR(&asyncop->c, WT_CURSTD_APPEND);
+ WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &cval));
+ if (cval.val)
+ F_SET(&asyncop->c, WT_CURSTD_OVERWRITE);
+ else
+ F_CLR(&asyncop->c, WT_CURSTD_OVERWRITE);
+ WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &cval));
+ if (cval.val)
+ F_SET(&asyncop->c, WT_CURSTD_RAW);
+ else
+ F_CLR(&asyncop->c, WT_CURSTD_RAW);
+ return (0);
+}
+
+/*
+ * __wt_async_new_op --
+ * Implementation of the WT_CONN->async_new_op method.
+ */
+int
+__wt_async_new_op(WT_SESSION_IMPL *session, const char *uri,
+ const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb,
+ WT_ASYNC_OP_IMPL **opp)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ *opp = NULL;
+
+ conn = S2C(session);
+ if (!conn->async_cfg)
+ return (ENOTSUP);
+
+ op = NULL;
+ WT_ERR(__async_new_op_alloc(session, uri, config, &op));
+ WT_ERR(__async_runtime_config(op, cfg));
+ op->cb = cb;
+ *opp = op;
+ return (0);
+
+err:
+ /*
+ * If we get an error after allocating op, set its state to free.
+ */
+ if (op != NULL)
+ op->state = WT_ASYNCOP_FREE;
+ return (ret);
+}
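
Note on the allocation scheme: __async_new_op_alloc above hands out op handles without a lock by scanning the array from the last allocation point and claiming a slot with a compare-and-swap from FREE to READY. The sketch below shows the same claim pattern in isolation, using C11 atomics in place of WiredTiger's WT_ATOMIC_CAS4; the slot_pool type, POOL_SIZE and slot_claim are hypothetical names for illustration, and unlike the real code (which restarts its scan from the published index when it loses a race) this version simply moves on to the next slot.

#include <stdatomic.h>

#define	POOL_SIZE 16			/* hypothetical pool size */
enum { SLOT_FREE, SLOT_READY };

struct slot_pool {
	_Atomic int state[POOL_SIZE];
	atomic_uint next;		/* hint: where the next search starts */
};

/* Claim a free slot; return its index, or -1 when full (maps to EBUSY). */
static int
slot_claim(struct slot_pool *p)
{
	unsigned i, n, start;
	int expected;

	start = atomic_load(&p->next);
	for (n = 0; n < POOL_SIZE; n++) {
		i = (start + n) % POOL_SIZE;
		expected = SLOT_FREE;
		/* The CAS both tests and claims the slot atomically. */
		if (atomic_compare_exchange_strong(
		    &p->state[i], &expected, SLOT_READY)) {
			atomic_store(&p->next, (i + 1) % POOL_SIZE);
			return ((int)i);
		}
	}
	return (-1);
}

int
main(void)
{
	static struct slot_pool pool;	/* static: zero-initialized, all FREE */

	return (slot_claim(&pool) == 0 ? 0 : 1);
}

Because the CAS is what transfers ownership of the slot, the winning thread can fill in the op's format and id fields without holding a lock, which is consistent with the code above taking ops_lock only around the shared format list.
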
diff --git a/src/third_party/wiredtiger/src/async/async_op.c b/src/third_party/wiredtiger/src/async/async_op.c
new file mode 100644
index 00000000000..9dba2b2b5f3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_op.c
@@ -0,0 +1,359 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_get_key --
+ * WT_ASYNC_OP->get_key implementation for op handles.
+ */
+static int
+__async_get_key(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, asyncop);
+ ret = __wt_cursor_get_keyv(&asyncop->c, asyncop->c.flags, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __async_set_key --
+ * WT_ASYNC_OP->set_key implementation for op handles.
+ */
+static void
+__async_set_key(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ va_list ap;
+
+ c = &asyncop->c;
+ va_start(ap, asyncop);
+ __wt_cursor_set_keyv(c, c->flags, ap);
+ if (!WT_DATA_IN_ITEM(&c->key) && !WT_CURSOR_RECNO(c))
+ WT_ERR(__wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop), &c->key,
+ c->key.data, c->key.size));
+ va_end(ap);
+ if (0)
+err: c->saved_err = ret;
+}
+
+/*
+ * __async_get_value --
+ * WT_ASYNC_OP->get_value implementation for op handles.
+ */
+static int
+__async_get_value(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, asyncop);
+ ret = __wt_cursor_get_valuev(&asyncop->c, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __async_set_value --
+ * WT_ASYNC_OP->set_value implementation for op handles.
+ */
+static void
+__async_set_value(WT_ASYNC_OP *asyncop, ...)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ va_list ap;
+
+ c = &asyncop->c;
+ va_start(ap, asyncop);
+ __wt_cursor_set_valuev(c, ap);
+ /* Copy the data, if it is pointing at data elsewhere. */
+ if (!WT_DATA_IN_ITEM(&c->value))
+ WT_ERR(__wt_buf_set(O2S((WT_ASYNC_OP_IMPL *)asyncop),
+ &c->value, c->value.data, c->value.size));
+ va_end(ap);
+ if (0)
+err: c->saved_err = ret;
+}
+
+/*
+ * __async_op_wrap --
+ * Common wrapper for all async operations.
+ */
+static int
+__async_op_wrap(WT_ASYNC_OP_IMPL *op, WT_ASYNC_OPTYPE type)
+{
+ op->optype = type;
+ return (__wt_async_op_enqueue(O2S(op), op));
+}
+
+/*
+ * __async_search --
+ * WT_ASYNC_OP->search implementation for op handles.
+ */
+static int
+__async_search(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, search);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_search);
+ WT_ERR(__async_op_wrap(op, WT_AOP_SEARCH));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_insert --
+ * WT_ASYNC_OP->insert implementation for op handles.
+ */
+static int
+__async_insert(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, insert);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_insert);
+ WT_ERR(__async_op_wrap(op, WT_AOP_INSERT));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_update --
+ * WT_ASYNC_OP->update implementation for op handles.
+ */
+static int
+__async_update(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, update);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_update);
+ WT_ERR(__async_op_wrap(op, WT_AOP_UPDATE));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_remove --
+ * WT_ASYNC_OP->remove implementation for op handles.
+ */
+static int
+__async_remove(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, remove);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_remove);
+ WT_ERR(__async_op_wrap(op, WT_AOP_REMOVE));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_compact --
+ * WT_ASYNC_OP->compact implementation for op handles.
+ */
+static int
+__async_compact(WT_ASYNC_OP *asyncop)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ op = (WT_ASYNC_OP_IMPL *)asyncop;
+ ASYNCOP_API_CALL(O2C(op), session, compact);
+ WT_STAT_FAST_CONN_INCR(O2S(op), async_op_compact);
+ WT_ERR(__async_op_wrap(op, WT_AOP_COMPACT));
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __async_get_id --
+ * WT_ASYNC_OP->get_id implementation for op handles.
+ */
+static uint64_t
+__async_get_id(WT_ASYNC_OP *asyncop)
+{
+ return (((WT_ASYNC_OP_IMPL *)asyncop)->unique_id);
+}
+
+/*
+ * __async_get_type --
+ * WT_ASYNC_OP->get_type implementation for op handles.
+ */
+static WT_ASYNC_OPTYPE
+__async_get_type(WT_ASYNC_OP *asyncop)
+{
+ return (((WT_ASYNC_OP_IMPL *)asyncop)->optype);
+}
+
+/*
+ * __async_op_init --
+ * Initialize all the op handle fields.
+ */
+static int
+__async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id)
+{
+ WT_ASYNC_OP *asyncop;
+
+ asyncop = (WT_ASYNC_OP *)op;
+ asyncop->connection = (WT_CONNECTION *)conn;
+ asyncop->key_format = asyncop->value_format = NULL;
+ asyncop->c.key_format = asyncop->c.value_format = NULL;
+ asyncop->get_key = __async_get_key;
+ asyncop->get_value = __async_get_value;
+ asyncop->set_key = __async_set_key;
+ asyncop->set_value = __async_set_value;
+ asyncop->search = __async_search;
+ asyncop->insert = __async_insert;
+ asyncop->update = __async_update;
+ asyncop->remove = __async_remove;
+ asyncop->compact = __async_compact;
+ asyncop->get_id = __async_get_id;
+ asyncop->get_type = __async_get_type;
+ /*
+ * The cursor needs to have the get/set key/value functions initialized.
+ * It also needs the key/value related fields set up.
+ */
+ asyncop->c.get_key = __wt_cursor_get_key;
+ asyncop->c.set_key = __wt_cursor_set_key;
+ asyncop->c.get_value = __wt_cursor_get_value;
+ asyncop->c.set_value = __wt_cursor_set_value;
+ asyncop->c.recno = 0;
+ memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf));
+ memset(&asyncop->c.key, 0, sizeof(asyncop->c.key));
+ memset(&asyncop->c.value, 0, sizeof(asyncop->c.value));
+ asyncop->c.session = (WT_SESSION *)conn->default_session;
+ asyncop->c.saved_err = 0;
+ asyncop->c.flags = 0;
+
+ op->internal_id = id;
+ op->state = WT_ASYNCOP_FREE;
+ return (0);
+}
+
+/*
+ * __wt_async_op_enqueue --
+ * Enqueue an operation onto the work queue.
+ */
+int
+__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
+{
+ WT_ASYNC *async;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint64_t cur_head, cur_tail, my_alloc, my_slot;
+#ifdef HAVE_DIAGNOSTIC
+ WT_ASYNC_OP_IMPL *my_op;
+#endif
+
+ conn = S2C(session);
+ async = conn->async;
+ /*
+ * Enqueue op at the tail of the work queue.
+ */
+ WT_ASSERT(session, op->state == WT_ASYNCOP_READY);
+ /*
+ * We get our slot in the ring buffer to use.
+ */
+ my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
+ my_slot = my_alloc % async->async_qsize;
+
+ /*
+ * Make sure we haven't wrapped around the queue.
+ * If so, wait for the tail to advance off this slot.
+ */
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ while (cur_tail == my_slot) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ }
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_ORDERED_READ(my_op, async->async_queue[my_slot]);
+ if (my_op != NULL)
+ return (__wt_panic(session));
+#endif
+ WT_PUBLISH(async->async_queue[my_slot], op);
+ op->state = WT_ASYNCOP_ENQUEUED;
+ if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
+ WT_PUBLISH(async->max_queue, async->cur_queue);
+ /*
+ * Multiple threads may be adding ops to the queue. We need to wait
+ * our turn to make our slot visible to workers.
+ */
+ WT_ORDERED_READ(cur_head, async->head);
+ while (cur_head != (my_alloc - 1)) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_head, async->head);
+ }
+ WT_PUBLISH(async->head, my_alloc);
+ return (ret);
+}
+
+/*
+ * __wt_async_op_init --
+ * Initialize all the op handles.
+ */
+int
+__wt_async_op_init(WT_SESSION_IMPL *session)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint32_t i;
+
+ conn = S2C(session);
+ async = conn->async;
+
+ /*
+ * Initialize the flush op structure.
+ */
+ WT_RET(__async_op_init(conn, &async->flush_op, OPS_INVALID_INDEX));
+
+ /*
+ * Allocate and initialize the work queue. This is sized so that
+ * the ring buffer is known to be big enough such that the head
+ * can never overlap the tail. Include extra for the flush op.
+ */
+ async->async_qsize = conn->async_size + 2;
+ WT_RET(__wt_calloc_def(
+ session, async->async_qsize, &async->async_queue));
+ /*
+ * Allocate and initialize all the user ops.
+ */
+ WT_ERR(__wt_calloc_def(session, conn->async_size, &async->async_ops));
+ for (i = 0; i < conn->async_size; i++) {
+ op = &async->async_ops[i];
+ WT_ERR(__async_op_init(conn, op, i));
+ }
+ return (0);
+err:
+ if (async->async_ops != NULL) {
+ __wt_free(session, async->async_ops);
+ async->async_ops = NULL;
+ }
+ if (async->async_queue != NULL) {
+ __wt_free(session, async->async_queue);
+ async->async_queue = NULL;
+ }
+ return (ret);
+}
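
Note on the queue protocol: __wt_async_op_enqueue above implements a multi-producer ring with ordered publication. A producer reserves a slot with an atomic increment of alloc_head, waits if the ring has wrapped onto the consumers' tail_slot, stores its op, and then waits its turn to advance the shared head, so slots become visible to workers strictly in allocation order. The following compilable sketch shows that protocol with C11 atomics standing in for WT_ATOMIC_ADD8, WT_ORDERED_READ and WT_PUBLISH; QSIZE and the ring type are hypothetical (the real queue is sized at async_size + 2 so the head can never overlap the tail).

#include <sched.h>
#include <stdatomic.h>

#define	QSIZE 64			/* hypothetical ring size */

struct ring {
	void *_Atomic slot[QSIZE];
	atomic_ullong alloc_head;	/* slots reserved by producers */
	atomic_ullong head;		/* slots visible to consumers */
	atomic_ullong tail_slot;	/* last slot consumers vacated */
};

static void
ring_enqueue(struct ring *r, void *item)
{
	unsigned long long my_alloc, my_slot;

	/* Reserve a slot; the fetch-and-add serializes producers. */
	my_alloc = atomic_fetch_add(&r->alloc_head, 1) + 1;
	my_slot = my_alloc % QSIZE;

	/* If we wrapped onto the tail, wait for consumers to move it. */
	while (atomic_load(&r->tail_slot) == my_slot)
		sched_yield();

	atomic_store(&r->slot[my_slot], item);

	/*
	 * Publish in allocation order: wait until every earlier producer
	 * has advanced head, then advance it past our own slot.
	 */
	while (atomic_load(&r->head) != my_alloc - 1)
		sched_yield();
	atomic_store(&r->head, my_alloc);
}

int
main(void)
{
	static struct ring r;		/* static: zero-initialized */
	int v = 42;

	ring_enqueue(&r, &v);
	return (atomic_load(&r.head) == 1 ? 0 : 1);
}

The ordered advance of head is what lets the dequeue side treat everything up to head as safely readable with a single ordered load.
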
diff --git a/src/third_party/wiredtiger/src/async/async_worker.c b/src/third_party/wiredtiger/src/async/async_worker.c
new file mode 100644
index 00000000000..74ee2dd2f86
--- /dev/null
+++ b/src/third_party/wiredtiger/src/async/async_worker.c
@@ -0,0 +1,359 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __async_op_dequeue --
+ * Wait for work to be available. Then atomically take it off
+ * the work queue.
+ */
+static int
+__async_op_dequeue(WT_CONNECTION_IMPL *conn, WT_SESSION_IMPL *session,
+ WT_ASYNC_OP_IMPL **op)
+{
+ WT_ASYNC *async;
+ long sleep_usec;
+ uint64_t cur_tail, last_consume, my_consume, my_slot, prev_slot;
+ uint32_t tries;
+
+ async = conn->async;
+ *op = NULL;
+ /*
+ * Wait for work to do. Work is available when async->head moves.
+ * Then grab the slot containing the work. If we lose, try again.
+ */
+retry:
+ tries = 0;
+ sleep_usec = 100;
+ WT_ORDERED_READ(last_consume, async->alloc_tail);
+ /*
+ * We stay in this loop until there is work to do.
+ */
+ while (last_consume == async->head &&
+ async->flush_state != WT_ASYNC_FLUSHING) {
+ WT_STAT_FAST_CONN_INCR(session, async_nowork);
+ if (++tries < MAX_ASYNC_YIELD)
+ /*
+ * Initially when we find no work, allow other
+ * threads to run.
+ */
+ __wt_yield();
+ else {
+ /*
+ * If we haven't found work in a while, start sleeping
+ * to wait for work to arrive instead of spinning.
+ */
+ __wt_sleep(0, sleep_usec);
+ sleep_usec = WT_MIN(sleep_usec * 2,
+ MAX_ASYNC_SLEEP_USECS);
+ }
+ if (!F_ISSET(session, WT_SESSION_SERVER_ASYNC))
+ return (0);
+ if (!F_ISSET(conn, WT_CONN_SERVER_ASYNC))
+ return (0);
+ if (F_ISSET(conn, WT_CONN_PANIC))
+ return (__wt_panic(session));
+ WT_ORDERED_READ(last_consume, async->alloc_tail);
+ }
+ if (async->flush_state == WT_ASYNC_FLUSHING)
+ return (0);
+ /*
+ * Try to increment the tail to claim this slot. If we lose
+ * a race, try again.
+ */
+ my_consume = last_consume + 1;
+ if (!WT_ATOMIC_CAS8(async->alloc_tail, last_consume, my_consume))
+ goto retry;
+ /*
+ * This item of work is ours to process. Clear it out of the
+ * queue and return.
+ */
+ my_slot = my_consume % async->async_qsize;
+ prev_slot = last_consume % async->async_qsize;
+ *op = WT_ATOMIC_STORE8(async->async_queue[my_slot], NULL);
+
+ WT_ASSERT(session, async->cur_queue > 0);
+ WT_ASSERT(session, *op != NULL);
+ WT_ASSERT(session, (*op)->state == WT_ASYNCOP_ENQUEUED);
+ (void)WT_ATOMIC_SUB4(async->cur_queue, 1);
+ (*op)->state = WT_ASYNCOP_WORKING;
+
+ if (*op == &async->flush_op)
+ /*
+ * We're the worker to take the flush op off the queue.
+ */
+ WT_PUBLISH(async->flush_state, WT_ASYNC_FLUSHING);
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ while (cur_tail != prev_slot) {
+ __wt_yield();
+ WT_ORDERED_READ(cur_tail, async->tail_slot);
+ }
+ WT_PUBLISH(async->tail_slot, my_slot);
+ return (0);
+}
+
+/*
+ * __async_flush_wait --
+ * Wait for the final worker to finish flushing.
+ */
+static int
+__async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen)
+{
+ WT_DECL_RET;
+
+ while (async->flush_state == WT_ASYNC_FLUSHING &&
+ async->flush_gen == my_gen)
+ WT_ERR(__wt_cond_wait(session, async->flush_cond, 10000));
+err: return (ret);
+}
+
+/*
+ * __async_worker_cursor --
+ * Return a cursor for the worker thread to use for its op.
+ * The worker thread caches cursors. So first search for one
+ * with the same config/uri signature. Otherwise open a new
+ * cursor and cache it.
+ */
+static int
+__async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_ASYNC_WORKER_STATE *worker, WT_CURSOR **cursorp)
+{
+ WT_ASYNC_CURSOR *ac;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ wt_session = (WT_SESSION *)session;
+ *cursorp = NULL;
+ /*
+ * Compact doesn't need a cursor.
+ */
+ if (op->optype == WT_AOP_COMPACT)
+ return (0);
+ WT_ASSERT(session, op->format != NULL);
+ STAILQ_FOREACH(ac, &worker->cursorqh, q) {
+ if (op->format->cfg_hash == ac->cfg_hash &&
+ op->format->uri_hash == ac->uri_hash) {
+ /*
+ * If one of our cached cursors has a matching
+ * signature, use it and we're done.
+ */
+ *cursorp = ac->c;
+ return (0);
+ }
+ }
+ /*
+ * We didn't find one in our cache. Open one and cache it.
+ * Insert it at the head expecting LRU usage.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &ac));
+ WT_ERR(wt_session->open_cursor(
+ wt_session, op->format->uri, NULL, op->format->config, &c));
+ ac->cfg_hash = op->format->cfg_hash;
+ ac->uri_hash = op->format->uri_hash;
+ ac->c = c;
+ STAILQ_INSERT_HEAD(&worker->cursorqh, ac, q);
+ worker->num_cursors++;
+ *cursorp = c;
+ return (0);
+
+err: __wt_free(session, ac);
+ return (ret);
+}
+
+/*
+ * __async_worker_execop --
+ * A worker thread executes an individual op with a cursor.
+ */
+static int
+__async_worker_execop(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_CURSOR *cursor)
+{
+ WT_ASYNC_OP *asyncop;
+ WT_ITEM val;
+ WT_SESSION *wt_session;
+
+ asyncop = (WT_ASYNC_OP *)op;
+ /*
+ * Set the key of our local cursor from the async op handle.
+ * If needed, also set the value.
+ */
+ if (op->optype != WT_AOP_COMPACT) {
+ WT_RET(__wt_cursor_get_raw_key(&asyncop->c, &val));
+ __wt_cursor_set_raw_key(cursor, &val);
+ if (op->optype == WT_AOP_INSERT ||
+ op->optype == WT_AOP_UPDATE) {
+ WT_RET(__wt_cursor_get_raw_value(&asyncop->c, &val));
+ __wt_cursor_set_raw_value(cursor, &val);
+ }
+ }
+ switch (op->optype) {
+ case WT_AOP_COMPACT:
+ wt_session = &session->iface;
+ WT_RET(wt_session->compact(wt_session,
+ op->format->uri, op->format->config));
+ break;
+ case WT_AOP_INSERT:
+ WT_RET(cursor->insert(cursor));
+ break;
+ case WT_AOP_UPDATE:
+ WT_RET(cursor->update(cursor));
+ break;
+ case WT_AOP_REMOVE:
+ WT_RET(cursor->remove(cursor));
+ break;
+ case WT_AOP_SEARCH:
+ WT_RET(cursor->search(cursor));
+ /*
+ * Get the value from the cursor and put it into
+ * the op for op->get_value.
+ */
+ WT_RET(__wt_cursor_get_raw_value(cursor, &val));
+ __wt_cursor_set_raw_value(&asyncop->c, &val);
+ break;
+ case WT_AOP_NONE:
+ default:
+ WT_RET_MSG(session, EINVAL, "Unknown async optype %d\n",
+ op->optype);
+ }
+ return (0);
+}
+
+/*
+ * __async_worker_op --
+ * A worker thread handles an individual op.
+ */
+static int
+__async_worker_op(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
+ WT_ASYNC_WORKER_STATE *worker)
+{
+ WT_ASYNC_OP *asyncop;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int cb_ret;
+
+ asyncop = (WT_ASYNC_OP *)op;
+
+ cb_ret = 0;
+
+ wt_session = &session->iface;
+ if (op->optype != WT_AOP_COMPACT)
+ WT_RET(wt_session->begin_transaction(wt_session, NULL));
+ WT_ASSERT(session, op->state == WT_ASYNCOP_WORKING);
+ WT_RET(__async_worker_cursor(session, op, worker, &cursor));
+ /*
+ * Perform op and invoke the callback.
+ */
+ ret = __async_worker_execop(session, op, cursor);
+ if (op->cb != NULL && op->cb->notify != NULL)
+ cb_ret = op->cb->notify(op->cb, asyncop, ret, 0);
+
+ /*
+ * If the operation succeeded and the user callback returned
+ * zero then commit. Otherwise rollback.
+ */
+ if (op->optype != WT_AOP_COMPACT) {
+ if ((ret == 0 || ret == WT_NOTFOUND) && cb_ret == 0)
+ WT_TRET(wt_session->commit_transaction(
+ wt_session, NULL));
+ else
+ WT_TRET(wt_session->rollback_transaction(
+ wt_session, NULL));
+ F_CLR(&asyncop->c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_TRET(cursor->reset(cursor));
+ }
+ /*
+ * After the callback returns and the transaction is resolved,
+ * release the op back to the free pool. We do this regardless of
+ * success or failure.
+ */
+ WT_PUBLISH(op->state, WT_ASYNCOP_FREE);
+ return (ret);
+}
+
+/*
+ * __wt_async_worker --
+ * The async worker threads.
+ */
+void *
+__wt_async_worker(void *arg)
+{
+ WT_ASYNC *async;
+ WT_ASYNC_CURSOR *ac, *acnext;
+ WT_ASYNC_OP_IMPL *op;
+ WT_ASYNC_WORKER_STATE worker;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t flush_gen;
+
+ session = arg;
+ conn = S2C(session);
+ async = conn->async;
+
+ worker.num_cursors = 0;
+ STAILQ_INIT(&worker.cursorqh);
+ while (F_ISSET(conn, WT_CONN_SERVER_ASYNC) &&
+ F_ISSET(session, WT_SESSION_SERVER_ASYNC)) {
+ WT_ERR(__async_op_dequeue(conn, session, &op));
+ if (op != NULL && op != &async->flush_op) {
+ /*
+ * If an operation fails, we want the worker thread to
+ * keep running, unless there is a panic.
+ */
+ (void)__async_worker_op(session, op, &worker);
+ if (F_ISSET(conn, WT_CONN_PANIC))
+ WT_ERR(__wt_panic(session));
+ } else if (async->flush_state == WT_ASYNC_FLUSHING) {
+ /*
+ * A flush is in progress. The last worker to arrive must clear
+ * the FLUSHING flag and signal the condition variable. While a
+ * flush is in progress, we take nothing off the queue.
+ */
+ WT_ORDERED_READ(flush_gen, async->flush_gen);
+ if (WT_ATOMIC_ADD4(async->flush_count, 1) ==
+ conn->async_workers) {
+ /*
+ * We're last. All workers accounted for so
+ * signal the condition and clear the FLUSHING
+ * flag to release the other worker threads.
+ * Set the FLUSH_COMPLETE flag so that the
+ * caller can return to the application.
+ */
+ WT_PUBLISH(async->flush_state,
+ WT_ASYNC_FLUSH_COMPLETE);
+ WT_ERR(__wt_cond_signal(session,
+ async->flush_cond));
+ } else
+ /*
+ * We need to wait for the last worker to
+ * signal the condition.
+ */
+ WT_ERR(__async_flush_wait(
+ session, async, flush_gen));
+ }
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "async worker error");
+ }
+ /*
+ * Worker thread cleanup, close our cached cursors and
+ * free all the WT_ASYNC_CURSOR structures.
+ */
+ ac = STAILQ_FIRST(&worker.cursorqh);
+ while (ac != NULL) {
+ acnext = STAILQ_NEXT(ac, q);
+ WT_TRET(ac->c->close(ac->c));
+ __wt_free(session, ac);
+ ac = acnext;
+ }
+ return (NULL);
+}
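
Note on the idle-wait strategy: __async_op_dequeue above spins with __wt_yield for the first MAX_ASYNC_YIELD attempts and then sleeps with exponential backoff capped at MAX_ASYNC_SLEEP_USECS, so a briefly idle worker picks up new work with minimal latency while a long-idle worker stops burning CPU. A minimal sketch of that backoff loop, with POSIX sched_yield and usleep standing in for __wt_yield and __wt_sleep, and with hypothetical bounds (SPIN_TRIES, MAX_SLEEP_USECS):

#include <sched.h>
#include <unistd.h>

#define	SPIN_TRIES	100		/* hypothetical spin bound */
#define	MAX_SLEEP_USECS	100000		/* cap the backoff at 100ms */

/* Wait until have_work() reports work: spin first, then back off. */
static void
wait_for_work(int (*have_work)(void))
{
	unsigned tries;
	useconds_t sleep_usec;

	tries = 0;
	sleep_usec = 100;
	while (!have_work()) {
		if (++tries < SPIN_TRIES)
			sched_yield();	/* cheap: let other threads run */
		else {
			usleep(sleep_usec);
			sleep_usec = sleep_usec * 2 > MAX_SLEEP_USECS ?
			    MAX_SLEEP_USECS : sleep_usec * 2;
		}
	}
}

static int
always_ready(void)
{
	return (1);
}

int
main(void)
{
	wait_for_work(always_ready);
	return (0);
}

The real loop also re-checks the server and panic flags on every iteration so a worker being shut down, or a panicked connection, exits the wait promptly; any production version of this sketch would need an equivalent escape hatch.
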
diff --git a/src/third_party/wiredtiger/src/block/block_addr.c b/src/third_party/wiredtiger/src/block/block_addr.c
new file mode 100644
index 00000000000..bbd52359157
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_addr.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __block_buffer_to_addr --
+ * Convert a filesystem address cookie into its components, UPDATING the
+ * caller's buffer reference so it can be called repeatedly to load a buffer.
+ */
+static int
+__block_buffer_to_addr(WT_BLOCK *block,
+ const uint8_t **pp, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+ uint64_t o, s, c;
+
+ WT_RET(__wt_vunpack_uint(pp, 0, &o));
+ WT_RET(__wt_vunpack_uint(pp, 0, &s));
+ WT_RET(__wt_vunpack_uint(pp, 0, &c));
+
+ /*
+ * To avoid storing large offsets, we minimize the value by subtracting
+ * a block for description information, then storing a count of block
+ * allocation units. That implies there is no such thing as an
+ * "invalid" offset though, they could all be valid (other than very
+ * large numbers), which is what we didn't want to store in the first
+ * place. Use the size: writing a block of size 0 makes no sense, so
+ * that's the out-of-band value. Once we're out of this function and
+ * are working with a real file offset, size and checksum triplet, there
+ * can be invalid offsets, that's simpler than testing sizes of 0 all
+ * over the place.
+ */
+ if (s == 0) {
+ *offsetp = 0;
+ *sizep = *cksump = 0;
+ } else {
+ *offsetp = (wt_off_t)(o + 1) * block->allocsize;
+ *sizep = (uint32_t)s * block->allocsize;
+ *cksump = (uint32_t)c;
+ }
+ return (0);
+}
+
+/*
+ * __wt_block_addr_to_buffer --
+ * Convert the filesystem components into an address cookie.
+ */
+int
+__wt_block_addr_to_buffer(WT_BLOCK *block,
+ uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum)
+{
+ uint64_t o, s, c;
+
+ /* See the comment above: this is the reverse operation. */
+ if (size == 0) {
+ o = WT_BLOCK_INVALID_OFFSET;
+ s = c = 0;
+ } else {
+ o = (uint64_t)offset / block->allocsize - 1;
+ s = size / block->allocsize;
+ c = cksum;
+ }
+ WT_RET(__wt_vpack_uint(pp, 0, o));
+ WT_RET(__wt_vpack_uint(pp, 0, s));
+ WT_RET(__wt_vpack_uint(pp, 0, c));
+ return (0);
+}
+
+/*
+ * __wt_block_buffer_to_addr --
+ * Convert a filesystem address cookie into its components NOT UPDATING
+ * the caller's buffer reference.
+ */
+int
+__wt_block_buffer_to_addr(WT_BLOCK *block,
+ const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump)
+{
+ return (__block_buffer_to_addr(block, &p, offsetp, sizep, cksump));
+}
+
+/*
+ * __wt_block_addr_valid --
+ * Return if an address cookie is valid.
+ */
+int
+__wt_block_addr_valid(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(session);
+ WT_UNUSED(addr_size);
+ WT_UNUSED(live);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, verify the address isn't on the available list,
+ * or for live systems, the discard list.
+ */
+ WT_RET(__wt_block_misplaced(
+ session, block, "addr-valid", offset, size, live));
+#endif
+
+ /* Check if it's past the end of the file. */
+ return (offset + size > block->fh->size ? 0 : 1);
+}
+
+/*
+ * __wt_block_addr_string --
+ * Return a printable string representation of an address cookie.
+ */
+int
+__wt_block_addr_string(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Printable representation. */
+ WT_RET(__wt_buf_fmt(session, buf,
+ "[%" PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)offset, (uintmax_t)offset + size, size, cksum));
+
+ return (0);
+}
+
+/*
+ * __wt_block_buffer_to_ckpt --
+ * Convert a checkpoint cookie into its components.
+ */
+int
+__wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci)
+{
+ uint64_t a;
+ const uint8_t **pp;
+
+ ci->version = *p++;
+ if (ci->version != WT_BM_CHECKPOINT_VERSION)
+ WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+ pp = &p;
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->alloc.offset, &ci->alloc.size, &ci->alloc.cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->avail.offset, &ci->avail.size, &ci->avail.cksum));
+ WT_RET(__block_buffer_to_addr(block, pp,
+ &ci->discard.offset, &ci->discard.size, &ci->discard.cksum));
+ WT_RET(__wt_vunpack_uint(pp, 0, &a));
+ ci->file_size = (wt_off_t)a;
+ WT_RET(__wt_vunpack_uint(pp, 0, &a));
+ ci->ckpt_size = a;
+
+ return (0);
+}
+
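+/*
+ * As the unpack sequence above implies, a checkpoint cookie is laid out
+ * as a 1-byte version followed by packed (offset, size, checksum)
+ * address triplets for the root page and the alloc, avail and discard
+ * extent lists, then the packed file size and checkpoint size:
+ *
+ * [version] [root] [alloc] [avail] [discard] [file size] [ckpt size]
+ */
+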
+/*
+ * __wt_block_ckpt_to_buffer --
+ * Convert the components into its checkpoint cookie.
+ */
+int
+__wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci)
+{
+ uint64_t a;
+
+ if (ci->version != WT_BM_CHECKPOINT_VERSION)
+ WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
+
+ (*pp)[0] = ci->version;
+ (*pp)++;
+
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->root_offset, ci->root_size, ci->root_cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->alloc.offset, ci->alloc.size, ci->alloc.cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->avail.offset, ci->avail.size, ci->avail.cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, pp,
+ ci->discard.offset, ci->discard.size, ci->discard.cksum));
+ a = (uint64_t)ci->file_size;
+ WT_RET(__wt_vpack_uint(pp, 0, a));
+ a = (uint64_t)ci->ckpt_size;
+ WT_RET(__wt_vpack_uint(pp, 0, a));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ckpt.c b/src/third_party/wiredtiger/src/block/block_ckpt.c
new file mode 100644
index 00000000000..83c3a40e8e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ckpt.c
@@ -0,0 +1,842 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __ckpt_string(
+ WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
+static int __ckpt_update(
+ WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, int);
+
+/*
+ * __wt_block_ckpt_init --
+ * Initialize a checkpoint structure.
+ */
+int
+__wt_block_ckpt_init(
+ WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
+{
+ WT_CLEAR(*ci);
+
+ ci->version = WT_BM_CHECKPOINT_VERSION;
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+
+ WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc", 0));
+ WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail", 1));
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->discard, name, "discard", 0));
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, name, "ckpt_avail", 1));
+
+ return (0);
+}
+
+/*
+ * __wt_block_checkpoint_load --
+ * Load a checkpoint.
+ */
+int
+__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ const uint8_t *addr, size_t addr_size,
+ uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint8_t *endp;
+
+ WT_UNUSED(addr_size);
+ ci = NULL;
+
+ /*
+ * Sometimes we don't find a root page (we weren't given a checkpoint,
+ * or the checkpoint was empty). In that case we return an empty root
+ * address; set that up now.
+ */
+ *root_addr_sizep = 0;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ if (addr != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, addr, tmp));
+ }
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: load-checkpoint: %s", block->name,
+ addr == NULL ? "[Empty]" : (const char *)tmp->data));
+ }
+
+ /*
+ * There's a single checkpoint in the file that can be written; all of
+ * the others are read-only. We use the same initialization calls for
+ * read-only checkpoints, but the information doesn't persist.
+ */
+ if (checkpoint) {
+ ci = &_ci;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
+ } else {
+ /*
+ * We depend on the btree level for locking: things will go
+ * bad fast should we open the live system in two handles, or
+ * if we create, salvage, truncate or verify the live/running
+ * file, for that matter.
+ */
+ ci = &block->live;
+ WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
+ }
+
+ /*
+ * If the checkpoint has an on-disk root page, load it. Otherwise, size
+ * the file past the description information.
+ */
+ if (addr == NULL || addr_size == 0)
+ ci->file_size = block->allocsize;
+ else {
+ /* Crack the checkpoint cookie. */
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ /* Verify sets up next. */
+ if (block->verify)
+ WT_ERR(__wt_verify_ckpt_load(session, block, ci));
+
+ /* Read any root page. */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
+ endp = root_addr;
+ WT_ERR(__wt_block_addr_to_buffer(block, &endp,
+ ci->root_offset, ci->root_size, ci->root_cksum));
+ *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
+ }
+
+ /*
+ * Rolling a checkpoint forward requires the avail list, the
+ * blocks from which we can allocate.
+ */
+ if (!checkpoint)
+ WT_ERR(__wt_block_extlist_read_avail(
+ session, block, &ci->avail, ci->file_size));
+ }
+
+ /*
+ * If the checkpoint can be written, that means anything written after
+ * the checkpoint is no longer interesting; truncate the file. Don't
+ * bother checking the avail list for a block at the end of the file,
+ * that was done when the checkpoint was first written (re-writing the
+ * checkpoint might possibly make it relevant here, but it's unlikely
+ * enough I don't bother).
+ */
+ if (!checkpoint) {
+ /*
+ * The truncate might fail if there's a file mapping (if there's
+ * an open checkpoint on the file), that's OK.
+ */
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size));
+ WT_ERR_BUSY_OK(
+ __wt_ftruncate(session, block->fh, ci->file_size));
+ }
+
+ if (0) {
+err: /*
+ * Don't call checkpoint-unload: unload does real work including
+ * file truncation. If we fail early enough that the checkpoint
+ * information isn't correct, bad things would happen. The only
+ * allocated memory was in the service of verify, clean that up.
+ */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+ }
+
+ /* Checkpoints don't need the original information, discard it. */
+ if (checkpoint && ci != NULL)
+ __wt_block_ckpt_destroy(session, ci);
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_checkpoint_unload --
+ * Unload a checkpoint.
+ */
+int
+__wt_block_checkpoint_unload(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint)
+{
+ WT_DECL_RET;
+
+ /* Verify cleanup. */
+ if (block->verify)
+ WT_TRET(__wt_verify_ckpt_unload(session, block));
+
+ /*
+ * If it's the live system, truncate to discard any extended blocks and
+ * discard the active extent lists. Hold the lock even though we're
+ * unloading the live checkpoint, there could be readers active in
+ * other checkpoints.
+ */
+ if (!checkpoint) {
+ /*
+ * The truncate might fail if there's a file mapping (if there's
+ * an open checkpoint on the file), that's OK.
+ */
+ WT_TRET_BUSY_OK(
+ __wt_ftruncate(session, block->fh, block->fh->size));
+
+ __wt_spin_lock(session, &block->live_lock);
+ __wt_block_ckpt_destroy(session, &block->live);
+ __wt_spin_unlock(session, &block->live_lock);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_block_ckpt_destroy --
+ * Clear a checkpoint structure.
+ */
+void
+__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
+{
+ /* Discard the extent lists. */
+ __wt_block_extlist_free(session, &ci->alloc);
+ __wt_block_extlist_free(session, &ci->avail);
+ __wt_block_extlist_free(session, &ci->discard);
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+}
+
+/*
+ * __wt_block_checkpoint --
+ * Create a new checkpoint.
+ */
+int
+__wt_block_checkpoint(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Write the root page: it's possible for there to be a checkpoint of
+ * an empty tree, in which case, we store an illegal root offset.
+ *
+ * !!!
+ * We happen to know that checkpoints are single-threaded above us in
+ * the btree engine. That's probably something we want to guarantee
+ * for any WiredTiger block manager.
+ */
+ if (buf == NULL) {
+ ci->root_offset = WT_BLOCK_INVALID_OFFSET;
+ ci->root_size = ci->root_cksum = 0;
+ } else
+ WT_RET(__wt_block_write_off(session, block, buf,
+ &ci->root_offset, &ci->root_size, &ci->root_cksum,
+ data_cksum, 0));
+
+ /*
+ * Checkpoints are potentially reading/writing/merging lots of blocks,
+ * pre-allocate structures for this thread's use.
+ */
+ WT_RET(__wt_block_ext_prealloc(session, 250));
+
+ /* Process the checkpoint list, deleting and updating as required. */
+ ret = __ckpt_process(session, block, ckptbase);
+
+ /* Discard any excessive memory we've allocated. */
+ WT_TRET(__wt_block_ext_discard(session, 250));
+
+ return (ret);
+}
+
+/*
+ * __ckpt_extlist_read --
+ * Read a checkpoint's extent lists into a newly allocated checkpoint
+ * structure.
+ */
+static int
+__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci;
+
+ /*
+ * Allocate a checkpoint structure, crack the cookie and read the
+ * checkpoint's extent lists.
+ *
+ * Ignore the avail list: checkpoint avail lists are only useful if we
+ * are rolling forward from the particular checkpoint and they represent
+ * our best understanding of what blocks can be allocated. If we are
+ * not operating on the live checkpoint, subsequent checkpoints might
+ * have allocated those blocks, and the avail list is useless. We don't
+ * discard it, because it is useful as part of verification, but we
+ * don't re-write it either.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
+
+ ci = ckpt->bpriv;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+ WT_RET(__wt_block_extlist_read(
+ session, block, &ci->alloc, ci->file_size));
+ WT_RET(__wt_block_extlist_read(
+ session, block, &ci->discard, ci->file_size));
+
+ return (0);
+}
+
+/*
+ * __ckpt_extlist_fblocks --
+ * If a checkpoint's extent list is going away, free its blocks.
+ */
+static int
+__ckpt_extlist_fblocks(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+ /*
+ * Free blocks used to write checkpoint extents into the live system's
+ * checkpoint avail list (they were never on any alloc list). Do not
+ * use the live system's avail list because that list is used to decide
+ * if the file can be truncated, and we can't truncate any part of the
+ * file that contains a previous checkpoint's extents.
+ */
+ return (__wt_block_insert_ext(
+ session, &block->live.ckpt_avail, el->offset, el->size));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __ckpt_verify --
+ * Diagnostic code, confirm we get what we expect in the checkpoint array.
+ */
+static int
+__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+
+ /*
+ * Fast check that we're seeing what we expect to see: some number of
+ * checkpoints to add, delete or ignore, terminated by a new checkpoint.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ switch (ckpt->flags) {
+ case 0:
+ case WT_CKPT_DELETE:
+ case WT_CKPT_DELETE | WT_CKPT_FAKE:
+ case WT_CKPT_FAKE:
+ break;
+ case WT_CKPT_ADD:
+ if (ckpt[1].name == NULL)
+ break;
+ /* FALLTHROUGH */
+ default:
+ return (
+ __wt_illegal_value(session, "checkpoint array"));
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __ckpt_process --
+ * Process the list of checkpoints.
+ */
+static int
+__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+{
+ WT_BLOCK_CKPT *a, *b, *ci;
+ WT_CKPT *ckpt, *next_ckpt;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint64_t ckpt_size;
+ int deleting, locked;
+
+ ci = &block->live;
+ locked = 0;
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__ckpt_verify(session, ckptbase));
+#endif
+
+ /*
+ * Checkpoints are a two-step process: first, write a new checkpoint to
+ * disk (including all the new extent lists for modified checkpoints
+ * and the live system). As part of this, create a list of file blocks
+ * newly available for reallocation, based on checkpoints being deleted.
+ * We then return the locations of the new checkpoint information to our
+ * caller. Our caller has to write that information into some kind of
+ * stable storage, and once that's done, we can actually allocate from
+ * that list of newly available file blocks. (We can't allocate from
+ * that list immediately because the allocation might happen before our
+ * caller saves the new checkpoint information, and if we crashed before
+ * the new checkpoint location was saved, we'd have overwritten blocks
+ * still referenced by checkpoints in the system.) In summary, there is
+ * a second step: after our caller saves the checkpoint information, we
+ * are called to add the newly available blocks into the live system's
+ * available list.
+ *
+ * This function is the first step, the second step is in the resolve
+ * function.
+ *
+ * If we're called to checkpoint the same file twice, without the second
+ * resolution step, it's an error at an upper level and our choices are
+ * all bad: either leak blocks or risk crashing with our caller not
+ * having saved the checkpoint information to stable storage. Leaked
+ * blocks are a safer choice, but that means file verify will fail for
+ * the rest of "forever", and the chance of us allocating a block and
+ * then crashing such that it matters is reasonably low: don't leak the
+ * blocks.
+ */
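+ /*
+ * A minimal sketch of the expected calling sequence (hypothetical
+ * caller, error handling omitted):
+ *
+ *	__wt_block_checkpoint(session, block, buf, ckptbase, data_cksum);
+ *	...save the returned checkpoint cookies to stable storage...
+ *	__wt_block_checkpoint_resolve(session, block);
+ */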
+ if (block->ckpt_inprogress) {
+ __wt_errx(session,
+ "%s: checkpointed without the checkpoint being resolved",
+ block->name);
+
+ WT_RET(__wt_block_checkpoint_resolve(session, block));
+ }
+
+ /*
+ * Extents newly available as a result of deleting previous checkpoints
+ * are added to a list of extents. The list should be empty, but as
+ * described above, there is no "free the checkpoint information" call
+ * into the block manager; if there was an error in an upper level that
+ * resulted in some previous checkpoint never being resolved, the list
+ * may not be empty. We should have caught that with the "checkpoint
+ * in progress" test, but it doesn't cost us anything to be cautious.
+ *
+ * We free the checkpoint's allocation and discard extent lists as part
+ * of the resolution step, not because they're needed at that time, but
+ * because it's potentially a lot of work, and waiting allows the btree
+ * layer to continue eviction sooner. As for the checkpoint-available
+ * list, make sure they get cleaned out.
+ */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ WT_RET(__wt_block_extlist_init(
+ session, &ci->ckpt_avail, "live", "ckpt_avail", 1));
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+
+ /*
+ * To delete a checkpoint, we'll need checkpoint information for it and
+ * the subsequent checkpoint into which it gets rolled; read them from
+ * disk before we lock things down.
+ */
+ deleting = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ deleting = 1;
+
+ /*
+ * Read the checkpoint and next checkpoint extent lists if we
+ * haven't already read them (we may have already read these
+ * extent blocks if there is more than one deleted checkpoint).
+ */
+ if (ckpt->bpriv == NULL)
+ WT_ERR(__ckpt_extlist_read(session, block, ckpt));
+
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * The "next" checkpoint may be the live tree which has no
+ * extent blocks to read.
+ */
+ if (next_ckpt->bpriv == NULL &&
+ !F_ISSET(next_ckpt, WT_CKPT_ADD))
+ WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
+ }
+
+ /*
+ * Hold a lock so the live extent lists and the file size can't change
+ * underneath us. I suspect we'll tighten this if checkpoints take too
+ * much time away from real work: we read the historic checkpoint
+ * information without a lock, but we could also merge and re-write the
+ * deleted and merged checkpoint information without a lock, except for
+ * the final merge of ranges into the live tree.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ locked = 1;
+
+ /*
+ * We've allocated our last page; update the checkpoint size. We need
+ * to calculate the live system's checkpoint size before merging
+ * checkpoint allocation and discard information from the checkpoints
+ * we're deleting, because those operations change the underlying byte
+ * counts.
+ */
+ ckpt_size = ci->ckpt_size;
+ ckpt_size += ci->alloc.bytes;
+ ckpt_size -= ci->discard.bytes;
+
+ /* Skip the additional processing if we aren't deleting checkpoints. */
+ if (!deleting)
+ goto live_update;
+
+ /*
+ * Delete any no-longer-needed checkpoints: we do this first as it frees
+ * blocks to the live lists, and the freed blocks will then be included
+ * when writing the live extent lists.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
+ !F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ if (tmp == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(
+ session, block, ckpt->raw.data, tmp));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: delete-checkpoint: %s: %s",
+ block->name, ckpt->name, (const char *)tmp->data));
+ }
+
+ /*
+ * Find the checkpoint into which we'll roll this checkpoint's
+ * blocks: it's the next real checkpoint in the list, and it
+ * better have been read in (if it's not the add slot).
+ */
+ for (next_ckpt = ckpt + 1;; ++next_ckpt)
+ if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
+ break;
+
+ /*
+ * Set the from/to checkpoint structures, where the "to" value
+ * may be the live tree.
+ */
+ a = ckpt->bpriv;
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ b = &block->live;
+ else
+ b = next_ckpt->bpriv;
+
+ /*
+ * Free the root page: there's nothing special about this free,
+ * the root page is allocated using normal rules, that is, it
+ * may have been taken from the avail list, and was entered on
+ * the live system's alloc list at that time. We free it into
+ * the checkpoint's discard list, however, not the live system's
+ * list because it appears on the checkpoint's alloc list and so
+ * must be paired in the checkpoint.
+ */
+ if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_ERR(__wt_block_insert_ext(session,
+ &a->discard, a->root_offset, a->root_size));
+
+ /*
+ * Free the blocks used to hold the "from" checkpoint's extent
+ * lists, including the avail list.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
+
+ /*
+ * Roll the "from" alloc and discard extent lists into the "to"
+ * checkpoint's lists.
+ */
+ if (a->alloc.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(
+ session, &a->alloc, &b->alloc));
+ if (a->discard.entries != 0)
+ WT_ERR(__wt_block_extlist_merge(
+ session, &a->discard, &b->discard));
+
+ /*
+ * If the "to" checkpoint is also being deleted, we're done with
+ * it, it's merged into some other checkpoint in the next loop.
+ * This means the extent lists may aggregate over a number of
+ * checkpoints, but that's OK, they're disjoint sets of ranges.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
+ continue;
+
+ /*
+ * Find blocks for re-use: wherever the "to" checkpoint's
+ * allocate and discard lists overlap, move the range to
+ * the live system's checkpoint available list.
+ */
+ WT_ERR(__wt_block_extlist_overlap(session, block, b));
+
+ /*
+ * If we're updating the live system's information, we're done.
+ */
+ if (F_ISSET(next_ckpt, WT_CKPT_ADD))
+ continue;
+
+ /*
+ * We have to write the "to" checkpoint's extent lists out in
+ * new blocks, and update its cookie.
+ *
+ * Free the blocks used to hold the "to" checkpoint's extent
+ * lists; don't include the avail list, it's not changing.
+ */
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
+ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
+
+ F_SET(next_ckpt, WT_CKPT_UPDATE);
+ }
+
+ /* Update checkpoints marked for update. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_UPDATE))
+ WT_ERR(__ckpt_update(
+ session, block, ckpt, ckpt->bpriv, 0));
+
+live_update:
+ /* Truncate the file if that's possible. */
+ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));
+
+ /* Update the final, added checkpoint based on the live system. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD)) {
+ /*
+ * Set the checkpoint size for the live system.
+ *
+ * !!!
+ * Our caller wants the final checkpoint size. Setting
+ * the size here violates layering, but the alternative
+ * is a call for the btree layer to crack the checkpoint
+ * cookie into its components, and that's a fair amount
+ * of work.
+ */
+ ckpt->ckpt_size = ci->ckpt_size = ckpt_size;
+
+ WT_ERR(__ckpt_update(session, block, ckpt, ci, 1));
+ }
+
+ /*
+ * Reset the live system's alloc and discard extent lists, leave the
+ * avail list alone. This includes freeing a lot of extents, so do it
+ * outside of the system's lock by copying and resetting the original,
+ * then doing the work later.
+ */
+ ci->ckpt_alloc = ci->alloc;
+ WT_ERR(__wt_block_extlist_init(
+ session, &ci->alloc, "live", "alloc", 0));
+ ci->ckpt_discard = ci->discard;
+ WT_ERR(__wt_block_extlist_init(
+ session, &ci->discard, "live", "discard", 0));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * The first checkpoint in the system should always have an empty
+ * discard list. If we've read that checkpoint and/or created it,
+ * check.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ break;
+ if ((a = ckpt->bpriv) == NULL)
+ a = &block->live;
+ if (a->discard.entries != 0) {
+ __wt_errx(session,
+ "first checkpoint incorrectly has blocks on the discard "
+ "list");
+ WT_ERR(WT_ERROR);
+ }
+#endif
+
+ block->ckpt_inprogress = 1;
+
+err: if (locked)
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard any checkpoint information we loaded. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if ((ci = ckpt->bpriv) != NULL)
+ __wt_block_ckpt_destroy(session, ci);
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __ckpt_update --
+ * Update a checkpoint.
+ */
+static int
+__ckpt_update(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, int is_live)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ uint8_t *endp;
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Check the extent list combinations for overlaps. */
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
+ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
+#endif
+ /*
+ * Write the checkpoint's alloc and discard extent lists. After each
+ * write, remove any allocated blocks from the system's allocation
+ * list, checkpoint extent blocks don't appear on any extent lists.
+ */
+ WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
+ WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));
+
+ /*
+ * We only write an avail list for the live system; other checkpoints'
+ * avail lists are static and never change.
+ *
+ * Write the avail list last so it reflects changes due to allocating
+ * blocks for the alloc and discard lists. Second, when we write the
+ * live system's avail list, it's two lists: the current avail list
+ * plus the list of blocks to be made available when the new checkpoint
+ * completes. We can't merge that second list into the real list yet,
+ * it's not truly available until the new checkpoint locations have been
+ * saved to the metadata.
+ */
+ if (is_live)
+ WT_RET(__wt_block_extlist_write(
+ session, block, &ci->avail, &ci->ckpt_avail));
+
+ /*
+ * Set the file size for the live system.
+ *
+ * !!!
+ * We do NOT set the file size when re-writing checkpoints because we
+ * want to test the checkpoint's blocks against a reasonable maximum
+ * file size during verification. This is bad: imagine a checkpoint
+ * appearing early in the file, re-written, and then the checkpoint
+ * requires blocks at the end of the file, blocks after the listed file
+ * size. If the application opens that checkpoint for writing
+ * (discarding subsequent checkpoints), we would truncate the file to
+ * the early chunk, discarding the re-written checkpoint information.
+ * The alternative, updating the file size, has its own problems: in
+ * that case we'd work correctly, but we'd lose all of the blocks
+ * between the original checkpoint and the re-written checkpoint.
+ * Currently, there's no API to roll forward intermediate checkpoints;
+ * if there ever is, this will need to be fixed.
+ */
+ if (is_live)
+ ci->file_size = block->fh->size;
+
+ /*
+ * Copy the checkpoint information into the checkpoint array's address
+ * cookie.
+ */
+ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
+ endp = ckpt->raw.mem;
+ WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
+ ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "%s: create-checkpoint: %s: %s",
+ block->name, ckpt->name, (const char *)tmp->data));
+ }
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_checkpoint_resolve --
+ * Resolve a checkpoint.
+ */
+int
+__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_BLOCK_CKPT *ci;
+ WT_DECL_RET;
+
+ ci = &block->live;
+
+ /*
+ * Resolve the checkpoint after our caller has written the checkpoint
+ * information to stable storage.
+ */
+ if (!block->ckpt_inprogress)
+ WT_RET_MSG(session, WT_ERROR,
+ "%s: checkpoint resolved, but no checkpoint in progress",
+ block->name);
+ block->ckpt_inprogress = 0;
+
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_extlist_merge(session, &ci->ckpt_avail, &ci->avail);
+ __wt_spin_unlock(session, &block->live_lock);
+
+ /* Discard the lists remaining after the checkpoint call. */
+ __wt_block_extlist_free(session, &ci->ckpt_avail);
+ __wt_block_extlist_free(session, &ci->ckpt_alloc);
+ __wt_block_extlist_free(session, &ci->ckpt_discard);
+
+ return (ret);
+}
+
+/*
+ * __ckpt_string --
+ * Return a printable string representation of a checkpoint address cookie.
+ */
+static int
+__ckpt_string(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+
+ /* Initialize the checkpoint, crack the cookie. */
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, ci, "string"));
+ WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));
+
+ WT_RET(__wt_buf_fmt(session, buf,
+ "version=%d",
+ ci->version));
+ if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", root=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->root_offset,
+ (uintmax_t)(ci->root_offset + ci->root_size),
+ ci->root_size, ci->root_cksum));
+ if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", alloc=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->alloc.offset,
+ (uintmax_t)(ci->alloc.offset + ci->alloc.size),
+ ci->alloc.size, ci->alloc.cksum));
+ if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", avail=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->avail.offset,
+ (uintmax_t)(ci->avail.offset + ci->avail.size),
+ ci->avail.size, ci->avail.cksum));
+ if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
+ else
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", discard=[%"
+ PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
+ (uintmax_t)ci->discard.offset,
+ (uintmax_t)(ci->discard.offset + ci->discard.size),
+ ci->discard.size, ci->discard.cksum));
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));
+
+ __wt_block_ckpt_destroy(session, ci);
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_compact.c b/src/third_party/wiredtiger/src/block/block_compact.c
new file mode 100644
index 00000000000..007c77f3291
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_compact.c
@@ -0,0 +1,221 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __block_dump_avail(WT_SESSION_IMPL *, WT_BLOCK *);
+
+/*
+ * __wt_block_compact_start --
+ * Start compaction of a file.
+ */
+int
+__wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ /*
+ * Save the current allocation plan, switch to first-fit allocation.
+ * We don't need the lock, but it's not a performance question and
+ * might avoid bugs in the future.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ block->allocfirst_save = block->allocfirst;
+ block->allocfirst = 1;
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (0);
+}
+
+/*
+ * __wt_block_compact_end --
+ * End compaction of a file.
+ */
+int
+__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ /*
+ * Restore the previous allocation plan.
+ * We don't need the lock, but it's not a performance question and
+ * might avoid bugs in the future.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ block->allocfirst = block->allocfirst_save;
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (0);
+}
+
+/*
+ * __wt_block_compact_skip --
+ * Return if compaction will shrink the file.
+ */
+int
+__wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp)
+{
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+ WT_FH *fh;
+ wt_off_t avail, ninety;
+
+ *skipp = 1; /* Return a default skip. */
+
+ fh = block->fh;
+
+ /*
+ * We do compaction by copying blocks from the end of the file to the
+ * beginning of the file, and we need some metrics to decide if it's
+ * worth doing. Ignore small files, and files where we are unlikely
+ * to recover 10% of the file.
+ */
+ if (fh->size <= 10 * 1024)
+ return (0);
+
+ __wt_spin_lock(session, &block->live_lock);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_COMPACT))
+ WT_ERR(__block_dump_avail(session, block));
+
+ /* Sum the number of available bytes in the first 90% of the file. */
+ avail = 0;
+ ninety = fh->size - fh->size / 10;
+
+ el = &block->live.avail;
+ WT_EXT_FOREACH(ext, el->off)
+ if (ext->off < ninety)
+ avail += ext->size;
+
+ /*
+ * If at least 10% of the total file is available and in the first 90%
+ * of the file, we'll try compaction.
+ */
+ if (avail >= fh->size / 10)
+ *skipp = 0;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_COMPACT,
+ "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
+ "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX
+ ") to perform compaction, compaction %s",
+ block->name,
+ (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail,
+ (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
+ *skipp ? "skipped" : "proceeding"));
+
+err: __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
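+/*
+ * To make the heuristic above concrete (illustrative numbers only): for
+ * a 100MB file, "ninety" is the 90MB mark; if the avail-list extents
+ * starting below that mark sum to at least 10MB (a tenth of the file),
+ * compaction proceeds, otherwise it's skipped.
+ */
+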
+/*
+ * __wt_block_compact_page_skip --
+ * Return if writing a particular page will shrink the file.
+ */
+int
+__wt_block_compact_page_skip(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp)
+{
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+ WT_FH *fh;
+ wt_off_t ninety, offset;
+ uint32_t size, cksum;
+
+ WT_UNUSED(addr_size);
+ *skipp = 1; /* Return a default skip. */
+
+ fh = block->fh;
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ __wt_spin_lock(session, &block->live_lock);
+
+ /*
+ * If this block is in the last 10% of the file and there's a block on
+ * the available list that's in the first 90% of the file, rewrite the
+ * block. Checking the available list is necessary (otherwise writing
+ * the block would extend the file), but there's an obvious race if the
+ * file is sufficiently busy.
+ */
+ ninety = fh->size - fh->size / 10;
+ if (offset > ninety) {
+ el = &block->live.avail;
+ WT_EXT_FOREACH(ext, el->off)
+ if (ext->off < ninety && ext->size >= size) {
+ *skipp = 0;
+ break;
+ }
+ }
+
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
+/*
+ * __block_dump_avail --
+ * Dump out the avail list so we can see what compaction will look like.
+ */
+static int
+__block_dump_avail(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_EXTLIST *el;
+ WT_EXT *ext;
+ wt_off_t decile[10], percentile[100], size, v;
+ u_int i;
+
+ el = &block->live.avail;
+ size = block->fh->size;
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "file size %" PRIuMAX "MB (%" PRIuMAX ") with %" PRIuMAX
+ "%% space available %" PRIuMAX "MB (%" PRIuMAX ")",
+ (uintmax_t)size / WT_MEGABYTE, (uintmax_t)size,
+ ((uintmax_t)el->bytes * 100) / (uintmax_t)size,
+ (uintmax_t)el->bytes / WT_MEGABYTE, (uintmax_t)el->bytes));
+
+ if (el->entries == 0)
+ return (0);
+
+ /*
+ * Bucket the available memory into file deciles/percentiles. Large
+ * pieces of memory will cross over multiple buckets; assign them to
+ * the decile/percentile buckets in 512B chunks.
+ */
+ memset(decile, 0, sizeof(decile));
+ memset(percentile, 0, sizeof(percentile));
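+ /*
+ * For example (illustrative numbers): a 2KB extent contributes four
+ * 512B chunks, each binned by its own offset, so an extent spanning
+ * a decile boundary splits its chunks across the adjacent buckets.
+ */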
+ WT_EXT_FOREACH(ext, el->off)
+ for (i = 0; i < ext->size / 512; ++i) {
+ ++decile[((ext->off + i * 512) * 10) / size];
+ ++percentile[((ext->off + i * 512) * 100) / size];
+ }
+
+#ifdef __VERBOSE_OUTPUT_PERCENTILE
+ for (i = 0; i < WT_ELEMENTS(percentile); ++i) {
+ v = percentile[i] * 512;
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+ PRIuMAX "%%)",
+ i, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+ (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+ }
+#endif
+ for (i = 0; i < WT_ELEMENTS(decile); ++i) {
+ v = decile[i] * 512;
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%2u%%: %12" PRIuMAX "MB, (%" PRIuMAX "B, %"
+ PRIuMAX "%%)",
+ i * 10, (uintmax_t)v / WT_MEGABYTE, (uintmax_t)v,
+ (uintmax_t)((v * 100) / (wt_off_t)el->bytes)));
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_ext.c b/src/third_party/wiredtiger/src/block/block_ext.c
new file mode 100644
index 00000000000..d500f93817a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_ext.c
@@ -0,0 +1,1437 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __block_append(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+static int __block_ext_overlap(WT_SESSION_IMPL *,
+ WT_BLOCK *, WT_EXTLIST *, WT_EXT **, WT_EXTLIST *, WT_EXT **);
+static int __block_extlist_dump(
+ WT_SESSION_IMPL *, const char *, WT_EXTLIST *, int);
+static int __block_merge(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+
+/*
+ * __block_off_srch_last --
+ * Return the last element in the list, along with a stack for appending.
+ */
+static inline WT_EXT *
+__block_off_srch_last(WT_EXT **head, WT_EXT ***stack)
+{
+ WT_EXT **extp, *last;
+ int i;
+
+ last = NULL; /* The list may be empty */
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL) {
+ last = *extp;
+ extp = &(*extp)->next[i];
+ } else
+ stack[i--] = extp--;
+ return (last);
+}
+
+/*
+ * __block_off_srch --
+ * Search a by-offset skiplist (either the primary by-offset list, or the
+ * by-offset list referenced by a size entry), for the specified offset.
+ */
+static inline void
+__block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off)
+{
+ WT_EXT **extp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ *
+ * The WT_EXT structure contains two skiplists, the primary one and the
+ * per-size bucket one: if the skip_off flag is set, offset the skiplist
+ * array by the depth specified in this particular structure.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;)
+ if (*extp != NULL && (*extp)->off < off)
+ extp =
+ &(*extp)->next[i + (skip_off ? (*extp)->depth : 0)];
+ else
+ stack[i--] = extp--;
+}
+
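+/*
+ * To illustrate the skip_off flag: a WT_EXT structure of depth d
+ * carries 2 * d forward pointers, next[0..d-1] for the primary
+ * by-offset skiplist and next[d..2*d-1] for the by-offset skiplist
+ * hanging off the entry's size bucket; skip_off makes the search step
+ * through that second set of pointers.
+ */
+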
+/*
+ * __block_first_srch --
+ * Search the skiplist for the first available slot.
+ */
+static inline int
+__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
+{
+ WT_EXT *ext;
+
+ /*
+ * Linear walk of the available chunks in offset order; take the first
+ * one that's large enough.
+ */
+ WT_EXT_FOREACH(ext, head)
+ if (ext->size >= size)
+ break;
+ if (ext == NULL)
+ return (0);
+
+ /* Build a stack for the offset we want. */
+ __block_off_srch(head, ext->off, stack, 0);
+ return (1);
+}
+
+/*
+ * __block_size_srch --
+ * Search the by-size skiplist for the specified size.
+ */
+static inline void
+__block_size_srch(WT_SIZE **head, wt_off_t size, WT_SIZE ***stack)
+{
+ WT_SIZE **szp;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ *
+ * Return a stack for an exact match or the next-largest item.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, szp = &head[i]; i >= 0;)
+ if (*szp != NULL && (*szp)->size < size)
+ szp = &(*szp)->next[i];
+ else
+ stack[i--] = szp--;
+}
+
+/*
+ * __block_off_srch_pair --
+ * Search a by-offset skiplist for before/after records of the specified
+ * offset.
+ */
+static inline void
+__block_off_srch_pair(
+ WT_EXTLIST *el, wt_off_t off, WT_EXT **beforep, WT_EXT **afterp)
+{
+ WT_EXT **head, **extp;
+ int i;
+
+ *beforep = *afterp = NULL;
+
+ head = el->off;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, extp = &head[i]; i >= 0;) {
+ if (*extp == NULL) {
+ --i;
+ --extp;
+ continue;
+ }
+
+ if ((*extp)->off < off) { /* Keep going at this level */
+ *beforep = *extp;
+ extp = &(*extp)->next[i];
+ } else { /* Drop down a level */
+ *afterp = *extp;
+ --i;
+ --extp;
+ }
+ }
+}
+
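+/*
+ * For illustration (hypothetical list): searching for offset 300 in a
+ * list with entries at offsets 100, 200 and 500 returns the entry at
+ * 200 as "before" and the entry at 500 as "after"; an exact match at
+ * 300 would be returned as "after".
+ */
+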
+/*
+ * __block_ext_insert --
+ * Insert an extent into an extent list.
+ */
+static int
+__block_ext_insert(WT_SESSION_IMPL *session, WT_EXTLIST *el, WT_EXT *ext)
+{
+ WT_EXT **astack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /*
+ * If we are inserting a new size onto the size skiplist, we'll need a
+ * new WT_SIZE structure for that skiplist.
+ */
+ if (el->track_size) {
+ __block_size_srch(el->sz, ext->size, sstack);
+ szp = *sstack[0];
+ if (szp == NULL || szp->size != ext->size) {
+ WT_RET(__wt_block_size_alloc(session, &szp));
+ szp->size = ext->size;
+ szp->depth = ext->depth;
+ for (i = 0; i < ext->depth; ++i) {
+ szp->next[i] = *sstack[i];
+ *sstack[i] = szp;
+ }
+ }
+
+ /*
+ * Insert the new WT_EXT structure into the size element's
+ * offset skiplist.
+ */
+ __block_off_srch(szp->off, ext->off, astack, 1);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i + ext->depth] = *astack[i];
+ *astack[i] = ext;
+ }
+ }
+#ifdef HAVE_DIAGNOSTIC
+ if (!el->track_size)
+ for (i = 0; i < ext->depth; ++i)
+ ext->next[i + ext->depth] = NULL;
+#endif
+
+ /* Insert the new WT_EXT structure into the offset skiplist. */
+ __block_off_srch(el->off, ext->off, astack, 0);
+ for (i = 0; i < ext->depth; ++i) {
+ ext->next[i] = *astack[i];
+ *astack[i] = ext;
+ }
+
+ ++el->entries;
+ el->bytes += (uint64_t)ext->size;
+
+ /* Update the cached end-of-list. */
+ if (ext->next[0] == NULL)
+ el->last = ext;
+
+ return (0);
+}
+
+/*
+ * __block_off_insert --
+ * Insert a file range into an extent list.
+ */
+static int
+__block_off_insert(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext;
+
+ WT_RET(__wt_block_ext_alloc(session, &ext));
+ ext->off = off;
+ ext->size = size;
+
+ return (__block_ext_insert(session, el, ext));
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __block_off_match --
+ * Return if any part of a specified range appears on a specified extent
+ * list.
+ */
+static int
+__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *before, *after;
+
+ /* Search for before and after entries for the offset. */
+ __block_off_srch_pair(el, off, &before, &after);
+
+ /* If "before" or "after" overlaps, we have a winner. */
+ if (before != NULL && before->off + before->size > off)
+ return (1);
+ if (after != NULL && off + size > after->off)
+ return (1);
+ return (0);
+}
+
+/*
+ * __wt_block_misplaced --
+ * Complain if a block appears on the available or discard lists.
+ */
+int
+__wt_block_misplaced(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live)
+{
+ const char *name;
+
+ name = NULL;
+
+ /*
+ * Don't check during the salvage read phase, we might be reading an
+ * already freed overflow page.
+ */
+ if (F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ return (0);
+
+ /*
+ * Verify a block the btree engine thinks it "owns" doesn't appear on
+ * the available or discard lists (it might reasonably be on the alloc
+ * list, if it was allocated since the last checkpoint). The engine
+ * "owns" a block if it's trying to read or free the block, and those
+ * functions make this check.
+ *
+ * Any block being read or freed should not be "available".
+ *
+ * Any block being read or freed in the live system should not be on the
+ * discard list. (A checkpoint handle might be reading a block which is
+ * on the live system's discard list; any attempt to free a block from a
+ * checkpoint handle has already failed.)
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ if (__block_off_match(&block->live.avail, offset, size))
+ name = "available";
+ else if (live && __block_off_match(&block->live.discard, offset, size))
+ name = "discard";
+ __wt_spin_unlock(session, &block->live_lock);
+ if (name != NULL) {
+ __wt_errx(session,
+ "%s failed: %" PRIuMAX "/%" PRIu32 " is on the %s list",
+ tag, (uintmax_t)offset, size, name);
+ return (__wt_panic(session));
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __block_off_remove --
+ * Remove a record from an extent list.
+ */
+static int
+__block_off_remove(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, WT_EXT **extp)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /* Find and remove the record from the by-offset skiplist. */
+ __block_off_srch(el->off, off, astack, 0);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
+ goto corrupt;
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i];
+
+ /*
+ * Find and remove the record from the size's offset skiplist; if that
+ * empties the by-size skiplist entry, remove it as well.
+ */
+ if (el->track_size) {
+ __block_size_srch(el->sz, ext->size, sstack);
+ szp = *sstack[0];
+ if (szp == NULL || szp->size != ext->size)
+ return (EINVAL);
+ __block_off_srch(szp->off, off, astack, 1);
+ ext = *astack[0];
+ if (ext == NULL || ext->off != off)
+ goto corrupt;
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext->next[i + ext->depth];
+ if (szp->off[0] == NULL) {
+ for (i = 0; i < szp->depth; ++i)
+ *sstack[i] = szp->next[i];
+ __wt_block_size_free(session, szp);
+ }
+ }
+#ifdef HAVE_DIAGNOSTIC
+ if (!el->track_size) {
+ int not_null;
+ for (i = 0, not_null = 0; i < ext->depth; ++i)
+ if (ext->next[i + ext->depth] != NULL)
+ not_null = 1;
+ WT_ASSERT(session, not_null == 0);
+ }
+#endif
+
+ --el->entries;
+ el->bytes -= (uint64_t)ext->size;
+
+ /* Return the record if our caller wants it, otherwise free it. */
+ if (extp == NULL)
+ __wt_block_ext_free(session, ext);
+ else
+ *extp = ext;
+
+ /* Update the cached end-of-list. */
+ if (el->last == ext)
+ el->last = NULL;
+
+ return (0);
+
+corrupt:
+ WT_PANIC_RET(session, EINVAL,
+ "attempt to remove non-existent offset from an extent list");
+}
+
+/*
+ * __wt_block_off_remove_overlap --
+ * Remove a range from an extent list, where the range may be part of an
+ * overlapping entry.
+ */
+int
+__wt_block_off_remove_overlap(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *before, *after, *ext;
+ wt_off_t a_off, a_size, b_off, b_size;
+
+ WT_ASSERT(session, off != WT_BLOCK_INVALID_OFFSET);
+
+ /* Search for before and after entries for the offset. */
+ __block_off_srch_pair(el, off, &before, &after);
+
+ /* If "before" or "after" overlaps, retrieve the overlapping entry. */
+ if (before != NULL && before->off + before->size > off) {
+ WT_RET(__block_off_remove(session, el, before->off, &ext));
+
+ /* Calculate overlapping extents. */
+ a_off = ext->off;
+ a_size = off - ext->off;
+ b_off = off + size;
+ b_size = ext->size - (a_size + size);
+ } else if (after != NULL && off + size > after->off) {
+ WT_RET(__block_off_remove(session, el, after->off, &ext));
+
+ /*
+ * Calculate overlapping extents. There's no initial overlap
+ * since the after extent presumably cannot begin before "off".
+ */
+ a_off = WT_BLOCK_INVALID_OFFSET;
+ a_size = 0;
+ b_off = off + size;
+ b_size = ext->size - (b_off - ext->off);
+ } else
+ return (WT_NOTFOUND);
+
+ /*
+ * If there are overlaps, insert the item; re-use the extent structure
+ * and save the allocation (we know there's no need to merge).
+ */
+ if (a_size != 0) {
+ ext->off = a_off;
+ ext->size = a_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ if (b_size != 0) {
+ if (ext == NULL)
+ WT_RET(__block_off_insert(session, el, b_off, b_size));
+ else {
+ ext->off = b_off;
+ ext->size = b_size;
+ WT_RET(__block_ext_insert(session, el, ext));
+ ext = NULL;
+ }
+ }
+ if (ext != NULL)
+ __wt_block_ext_free(session, ext);
+ return (0);
+}
+
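+/*
+ * A worked example of the split above (illustrative numbers): removing
+ * the range [200, 300) from an entry spanning [100, 400) leaves a
+ * leading extent [100, 200) and a trailing extent [300, 400), both
+ * re-inserted; if the removed range starts at the entry's offset, only
+ * the trailing extent survives.
+ */
+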
+/*
+ * __block_extend --
+ * Extend the file to allocate space.
+ */
+static inline int
+__block_extend(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+{
+ WT_FH *fh;
+
+ fh = block->fh;
+
+ /*
+ * Callers of this function are expected to have already acquired any
+ * locks required to extend the file.
+ *
+ * We should never be allocating from an empty file.
+ */
+ if (fh->size < block->allocsize)
+ WT_RET_MSG(session, EINVAL,
+ "file has no description information");
+
+ /*
+ * Make sure we don't allocate past the maximum file size. There's no
+ * easy way to know the maximum wt_off_t on a system; limit growth to
+ * what fits in a signed 8-byte value (we currently check that wt_off_t
+ * is 8 bytes in verify_build.h). I don't think we're likely to see
+ * anything bigger for a while.
+ */
+ if (fh->size > (wt_off_t)INT64_MAX - size)
+ WT_RET_MSG(session, WT_ERROR,
+ "block allocation failed, file cannot grow further");
+
+ *offp = fh->size;
+ fh->size += size;
+
+ WT_STAT_FAST_DATA_INCR(session, block_extension);
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "file extend %" PRIdMAX "B @ %" PRIdMAX,
+ (intmax_t)size, (intmax_t)*offp));
+
+ return (0);
+}
+
+/*
+ * __wt_block_alloc --
+ * Alloc a chunk of space from the underlying file.
+ */
+int
+__wt_block_alloc(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size)
+{
+ WT_EXT *ext, **estack[WT_SKIP_MAXDEPTH];
+ WT_SIZE *szp, **sstack[WT_SKIP_MAXDEPTH];
+
+ /* Assert we're maintaining the by-size skiplist. */
+ WT_ASSERT(session, block->live.avail.track_size != 0);
+
+ WT_STAT_FAST_DATA_INCR(session, block_alloc);
+ if (size % block->allocsize != 0)
+ WT_RET_MSG(session, EINVAL,
+ "cannot allocate a block size %" PRIdMAX " that is not "
+ "a multiple of the allocation size %" PRIu32,
+ (intmax_t)size, block->allocsize);
+
+ /*
+ * Allocation is either first-fit (lowest offset), or best-fit (best
+ * size). If it's first-fit, walk the offset list linearly until we
+ * find an entry that will work.
+ *
+ * If it's best-fit by size, search the by-size skiplist for the size
+ * and take the first entry on the by-size offset list. This means we
+ * prefer best-fit over lower offset, but within a size we'll prefer an
+ * offset appearing earlier in the file.
+ *
+ * If we don't have anything big enough, extend the file.
+ */
+ if (block->live.avail.bytes < (uint64_t)size)
+ goto append;
+ if (block->allocfirst) {
+ if (!__block_first_srch(block->live.avail.off, size, estack))
+ goto append;
+ ext = *estack[0];
+ } else {
+ __block_size_srch(block->live.avail.sz, size, sstack);
+ if ((szp = *sstack[0]) == NULL) {
+append: WT_RET(__block_extend(session, block, offp, size));
+ WT_RET(__block_append(session,
+ &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
+ }
+
+ /* Take the first record. */
+ ext = szp->off[0];
+ }
+
+ /* Remove the record, and set the returned offset. */
+ WT_RET(__block_off_remove(session, &block->live.avail, ext->off, &ext));
+ *offp = ext->off;
+
+ /* If doing a partial allocation, adjust the record and put it back. */
+ if (ext->size > size) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "allocate %" PRIdMAX " from range %" PRIdMAX "-%"
+ PRIdMAX ", range shrinks to %" PRIdMAX "-%" PRIdMAX,
+ (intmax_t)size,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)(ext->off + size),
+ (intmax_t)(ext->off + size + ext->size - size)));
+
+ ext->off += size;
+ ext->size -= size;
+ WT_RET(__block_ext_insert(session, &block->live.avail, ext));
+ } else {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "allocate range %" PRIdMAX "-%" PRIdMAX,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size)));
+
+ __wt_block_ext_free(session, ext);
+ }
+
+ /* Add the newly allocated extent to the list of allocations. */
+ WT_RET(__block_merge(
+ session, &block->live.alloc, *offp, (wt_off_t)size));
+ return (0);
+}
+
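+/*
+ * To illustrate the best-fit path above (hypothetical sizes): an 8KB
+ * allocation searches the by-size skiplist for 8KB, takes the
+ * lowest-offset extent from that size bucket (or the next larger one),
+ * and if that extent is, say, 12KB, returns its first 8KB and
+ * re-inserts the trailing 4KB on the avail list.
+ */
+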
+/*
+ * __wt_block_free --
+ * Free a cookie-referenced chunk of space to the underlying file.
+ */
+int
+__wt_block_free(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
+{
+ WT_DECL_RET;
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+ WT_STAT_FAST_DATA_INCR(session, block_free);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "free %" PRIdMAX "/%" PRIdMAX, (intmax_t)offset, (intmax_t)size));
+
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(__wt_block_misplaced(session, block, "free", offset, size, 1));
+#endif
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ ret = __wt_block_off_free(session, block, offset, (wt_off_t)size);
+ __wt_spin_unlock(session, &block->live_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_off_free --
+ * Free a file range to the underlying file.
+ */
+int
+__wt_block_off_free(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+{
+ WT_DECL_RET;
+
+ /*
+ * Callers of this function are expected to have already acquired any
+ * locks required to manipulate the extent lists.
+ *
+ * We can reuse this extent immediately if it was allocated during this
+ * checkpoint, merge it into the avail list (which slows file growth in
+ * workloads including repeated overflow record modification). If this
+ * extent is referenced in a previous checkpoint, merge into the discard
+ * list.
+ */
+ if ((ret = __wt_block_off_remove_overlap(
+ session, &block->live.alloc, offset, size)) == 0)
+ ret = __block_merge(
+ session, &block->live.avail, offset, (wt_off_t)size);
+ else if (ret == WT_NOTFOUND)
+ ret = __block_merge(
+ session, &block->live.discard, offset, (wt_off_t)size);
+ return (ret);
+}
+
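+/*
+ * In other words (illustrative): freeing a block allocated since the
+ * last checkpoint finds it on the live alloc list and merges it into
+ * avail for immediate reuse; freeing a block referenced by an earlier
+ * checkpoint misses the alloc list (WT_NOTFOUND) and merges into
+ * discard instead, deferring reuse until no checkpoint needs it.
+ */
+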
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_block_extlist_check --
+ * Return if the extent lists overlap.
+ */
+int
+__wt_block_extlist_check(
+ WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl)
+{
+ WT_EXT *a, *b;
+
+ a = al->off[0];
+ b = bl->off[0];
+
+ /* Walk the lists in parallel, looking for overlaps. */
+ while (a != NULL && b != NULL) {
+ /*
+ * If there's no overlap, move the lower-offset entry to the
+ * next entry in its list.
+ */
+ if (a->off + a->size <= b->off) {
+ a = a->next[0];
+ continue;
+ }
+ if (b->off + b->size <= a->off) {
+ b = b->next[0];
+ continue;
+ }
+ WT_PANIC_RET(session, EINVAL,
+ "checkpoint merge check: %s list overlaps the %s list",
+ al->name, bl->name);
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __wt_block_extlist_overlap --
+ * Review a checkpoint's alloc/discard extent lists, move overlaps into the
+ * live system's checkpoint-avail list.
+ */
+int
+__wt_block_extlist_overlap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
+{
+ WT_EXT *alloc, *discard;
+
+ alloc = ci->alloc.off[0];
+ discard = ci->discard.off[0];
+
+ /* Walk the lists in parallel, looking for overlaps. */
+ while (alloc != NULL && discard != NULL) {
+ /*
+ * If there's no overlap, move the lower-offset entry to the
+ * next entry in its list.
+ */
+ if (alloc->off + alloc->size <= discard->off) {
+ alloc = alloc->next[0];
+ continue;
+ }
+ if (discard->off + discard->size <= alloc->off) {
+ discard = discard->next[0];
+ continue;
+ }
+
+ /* Reconcile the overlap. */
+ WT_RET(__block_ext_overlap(session, block,
+ &ci->alloc, &alloc, &ci->discard, &discard));
+ }
+ return (0);
+}
+
+/*
+ * __block_ext_overlap --
+ * Reconcile two overlapping ranges.
+ */
+static int
+__block_ext_overlap(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *ael, WT_EXT **ap, WT_EXTLIST *bel, WT_EXT **bp)
+{
+ WT_EXT *a, *b, **ext;
+ WT_EXTLIST *avail, *el;
+ wt_off_t off, size;
+
+ avail = &block->live.ckpt_avail;
+
+ /*
+ * The ranges overlap, choose the range we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ *
+ * By swapping the arguments so "A" is always the lower range, we can
+ * eliminate cases #2, #8, #10 and #11, and only handle 7 cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB ranges are the same
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #9 AAAAA A is a prefix of B
+ */
+ a = *ap;
+ b = *bp;
+ if (a->off > b->off) { /* Swap */
+ b = *ap;
+ a = *bp;
+ ext = ap; ap = bp; bp = ext;
+ el = ael; ael = bel; bel = el;
+ }
+
+ if (a->off == b->off) { /* Case #1, #4, #9 */
+ if (a->size == b->size) { /* Case #1 */
+ /*
+ * Move caller's A and B to the next element
+ * Add that A and B range to the avail list
+ * Delete A and B
+ */
+ *ap = (*ap)->next[0];
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+ else if (a->size > b->size) { /* Case #4 */
+ /*
+ * Remove A from its list
+ * Increment/Decrement A's offset/size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->off += b->size;
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else { /* Case #9 */
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the size of A
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += a->size;
+ b->size -= a->size;
+ WT_RET(__block_ext_insert(session, bel, b));
+
+ /*
+ * Move caller's A to the next element
+ * Add A's range to the avail list
+ * Delete A
+ */
+ *ap = (*ap)->next[0];
+ WT_RET(__block_merge(session, avail, a->off, a->size));
+ WT_RET(__block_off_remove(session, ael, a->off, NULL));
+ }
+ } else if (a->off + a->size == b->off + b->size) { /* Case #6 */
+ /*
+ * Remove A from its list
+ * Decrement A's size by the size of B
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= b->size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ } else if (a->off + a->size < b->off + b->size) { /* Case #3, #7 */
+ /*
+ * Add overlap to the avail list
+ */
+ off = b->off;
+ size = (a->off + a->size) - b->off;
+ WT_RET(__block_merge(session, avail, off, size));
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by the overlap
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size -= size;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /*
+ * Remove B from its list
+ * Increment/Decrement B's offset/size by the overlap
+ * Insert B on its list
+ */
+ WT_RET(__block_off_remove(session, bel, b->off, &b));
+ b->off += size;
+ b->size -= size;
+ WT_RET(__block_ext_insert(session, bel, b));
+ } else { /* Case #5 */
+ /* Calculate the offset/size of the trailing part of A. */
+ off = b->off + b->size;
+ size = (a->off + a->size) - off;
+
+ /*
+ * Remove A from its list
+ * Decrement A's size by trailing part of A plus B's size
+ * Insert A on its list
+ */
+ WT_RET(__block_off_remove(session, ael, a->off, &a));
+ a->size = b->off - a->off;
+ WT_RET(__block_ext_insert(session, ael, a));
+
+ /* Add trailing part of A to A's list as a new element. */
+ WT_RET(__block_merge(session, ael, off, size));
+
+ /*
+ * Move caller's B to the next element
+ * Add B's range to the avail list
+ * Delete B
+ */
+ *bp = (*bp)->next[0];
+ WT_RET(__block_merge(session, avail, b->off, b->size));
+ WT_RET(__block_off_remove(session, bel, b->off, NULL));
+ }
+
+ return (0);
+}
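+
+/*
+ * A worked example of case #5 (B in the middle of A), assuming 4KB
+ * allocation units: if the alloc list holds A = {4096, 16384} and the
+ * discard list holds B = {8192, 4096}, the code above shrinks A to
+ * {4096, 4096}, re-inserts the trailing piece {12288, 8192} into A's
+ * list, and moves the doubly-referenced range {8192, 4096} onto the
+ * live system's checkpoint-avail list.
+ */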
+
+/*
+ * __wt_block_extlist_merge --
+ * Merge one extent list into another.
+ */
+int
+__wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b)
+{
+ WT_EXT *ext;
+ WT_EXTLIST tmp;
+ u_int i;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_BLOCK, "merging %s into %s", a->name, b->name));
+
+ /*
+ * Sometimes the list we are merging is much bigger than the other: if
+ * so, swap the lists around to reduce the amount of work we need to do
+ * during the merge. The size lists have to match as well, so this is
+ * only possible if both lists track sizes, or neither does.
+ */
+ if (a->track_size == b->track_size && a->entries > b->entries) {
+ tmp = *a;
+ a->bytes = b->bytes;
+ b->bytes = tmp.bytes;
+ a->entries = b->entries;
+ b->entries = tmp.entries;
+ for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
+ a->off[i] = b->off[i];
+ b->off[i] = tmp.off[i];
+ a->sz[i] = b->sz[i];
+ b->sz[i] = tmp.sz[i];
+ }
+ }
+
+ WT_EXT_FOREACH(ext, a->off)
+ WT_RET(__block_merge(session, b, ext->off, ext->size));
+
+ return (0);
+}
+
+/*
+ * __block_append --
+ * Append a new entry to the allocation list.
+ */
+static int
+__block_append(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ WT_ASSERT(session, el->track_size == 0);
+
+ /*
+ * Identical to __block_merge, when we know the file is being extended,
+ * that is, the information is either going to be used to extend the
+ * last object on the list, or become a new object ending the list.
+ *
+ * The terminating element of the list is cached, check it; otherwise,
+ * get a stack for the last object in the skiplist, check for a simple
+ * extension, and otherwise append a new structure.
+ */
+ if ((ext = el->last) != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ ext = __block_off_srch_last(el->off, astack);
+ if (ext != NULL && ext->off + ext->size == off)
+ ext->size += size;
+ else {
+ WT_RET(__wt_block_ext_alloc(session, &ext));
+ ext->off = off;
+ ext->size = size;
+
+ for (i = 0; i < ext->depth; ++i)
+ *astack[i] = ext;
+ ++el->entries;
+ }
+
+ /* Update the cached end-of-list */
+ el->last = ext;
+ }
+ el->bytes += (uint64_t)size;
+
+ return (0);
+}
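+
+/*
+ * For illustration, assuming 4KB blocks: if the cached end-of-list entry
+ * is {8192, 4096} and the file grows by a block at offset 12288, the fast
+ * path above extends the entry to {8192, 8192} without a skiplist search
+ * or a new WT_EXT allocation.
+ */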
+
+/*
+ * __wt_block_insert_ext --
+ * Insert an extent into an extent list, merging if possible.
+ */
+int
+__wt_block_insert_ext(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ /*
+ * There are currently two copies of this function (this code is a one-
+ * liner that calls the internal version of the function, which means
+ * the compiler should compress out the function call). It's that way
+ * because the interface is still fluid, I'm not convinced there won't
+ * be a need for a functional split between the internal and external
+ * versions in the future.
+ *
+ * Callers of this function are expected to have already acquired any
+ * locks required to manipulate the extent list.
+ */
+ return (__block_merge(session, el, off, size));
+}
+
+/*
+ * __block_merge --
+ * Insert an extent into an extent list, merging if possible (internal
+ * version).
+ */
+static int
+__block_merge(
+ WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size)
+{
+ WT_EXT *ext, *after, *before;
+
+ /*
+ * Retrieve the records preceding/following the offset. If the records
+ * are contiguous with the free'd offset, combine records.
+ */
+ __block_off_srch_pair(el, off, &before, &after);
+ if (before != NULL) {
+ if (before->off + before->size > off)
+ WT_PANIC_RET(session, EINVAL,
+ "%s: existing range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with merge range %" PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)before->off,
+ (intmax_t)(before->off + before->size),
+ (intmax_t)off, (intmax_t)(off + size));
+ if (before->off + before->size != off)
+ before = NULL;
+ }
+ if (after != NULL) {
+ if (off + size > after->off)
+ WT_PANIC_RET(session, EINVAL,
+ "%s: merge range %" PRIdMAX "-%" PRIdMAX
+ " overlaps with existing range %" PRIdMAX
+ "-%" PRIdMAX,
+ el->name,
+ (intmax_t)off, (intmax_t)(off + size),
+ (intmax_t)after->off,
+ (intmax_t)(after->off + after->size));
+ if (off + size != after->off)
+ after = NULL;
+ }
+ if (before == NULL && after == NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: insert range %" PRIdMAX "-%" PRIdMAX,
+ el->name, (intmax_t)off, (intmax_t)(off + size)));
+
+ return (__block_off_insert(session, el, off, size));
+ }
+
+ /*
+ * If the "before" offset range abuts, we'll use it as our new record;
+ * if the "after" offset range also abuts, include its size and remove
+ * it from the system. Else, only the "after" offset range abuts, use
+ * the "after" offset range as our new record. In either case, remove
+ * the record we're going to use, adjust it and re-insert it.
+ */
+ if (before == NULL) {
+ WT_RET(__block_off_remove(session, el, after->off, &ext));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)off, (intmax_t)(off + ext->size + size)));
+
+ ext->off = off;
+ ext->size += size;
+ } else {
+ if (after != NULL) {
+ size += after->size;
+ WT_RET(
+ __block_off_remove(session, el, after->off, NULL));
+ }
+ WT_RET(__block_off_remove(session, el, before->off, &ext));
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: range grows from %" PRIdMAX "-%" PRIdMAX ", to %"
+ PRIdMAX "-%" PRIdMAX,
+ el->name,
+ (intmax_t)ext->off, (intmax_t)(ext->off + ext->size),
+ (intmax_t)ext->off,
+ (intmax_t)(ext->off + ext->size + size)));
+
+ ext->size += size;
+ }
+ return (__block_ext_insert(session, el, ext));
+}
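+
+/*
+ * A merge example, assuming 4KB blocks: with {8192, 4096} and
+ * {16384, 4096} already on the list, inserting {12288, 4096} finds both
+ * a "before" and an "after" neighbor, removes the "after" entry and
+ * folds all three ranges into the single extent {8192, 12288}.
+ */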
+
+/*
+ * __wt_block_extlist_read_avail --
+ *	Read an avail extent list; includes minor special handling.
+ */
+int
+__wt_block_extlist_read_avail(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+{
+ WT_DECL_RET;
+
+ /* If there isn't a list, we're done. */
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, reads are checked against the available and
+ * discard lists (a block being read should never appear on either).
+ * Checkpoint threads may be running in the file, don't race with
+ * them.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+#endif
+
+ WT_ERR(__wt_block_extlist_read(session, block, el, ckpt_size));
+
+ /*
+ * Extent blocks are allocated from the available list: if reading the
+ * avail list, the extent blocks might be included, remove them.
+ */
+ WT_ERR_NOTFOUND_OK(
+ __wt_block_off_remove_overlap(session, el, el->offset, el->size));
+
+err:
+#ifdef HAVE_DIAGNOSTIC
+ __wt_spin_unlock(session, &block->live_lock);
+#endif
+
+ return (ret);
+}
+
+/*
+ * __wt_block_extlist_read --
+ * Read an extent list.
+ */
+int
+__wt_block_extlist_read(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ wt_off_t off, size;
+ int (*func)(WT_SESSION_IMPL *, WT_EXTLIST *, wt_off_t, wt_off_t);
+ const uint8_t *p;
+
+ /* If there isn't a list, we're done. */
+ if (el->offset == WT_BLOCK_INVALID_OFFSET)
+ return (0);
+
+ WT_RET(__wt_scr_alloc(session, el->size, &tmp));
+ WT_ERR(__wt_block_read_off(
+ session, block, tmp, el->offset, el->size, el->cksum));
+
+#define WT_EXTLIST_READ(p, v) do { \
+ uint64_t _v; \
+ WT_ERR(__wt_vunpack_uint(&(p), 0, &_v)); \
+ (v) = (wt_off_t)_v; \
+} while (0)
+
+ p = WT_BLOCK_HEADER_BYTE(tmp->mem);
+ WT_EXTLIST_READ(p, off);
+ WT_EXTLIST_READ(p, size);
+ if (off != WT_BLOCK_EXTLIST_MAGIC || size != 0)
+ goto corrupted;
+
+ /*
+ * If we're not creating both offset and size skiplists, use the simpler
+ * append API, otherwise do a full merge. There are two reasons for the
+ * test: first, checkpoint "available" lists are NOT sorted (checkpoints
+ * write two separate lists, both of which are sorted but they're not
+ * merged). Second, the "available" list is sorted by size as well as
+ * by offset, and the fast-path append code doesn't support that; it's
+ * limited to offset. The test of "track size" is short-hand for "are
+ * we reading the available list?".
+ */
+ func = el->track_size == 0 ? __block_append : __block_merge;
+ for (;;) {
+ WT_EXTLIST_READ(p, off);
+ WT_EXTLIST_READ(p, size);
+ if (off == WT_BLOCK_INVALID_OFFSET)
+ break;
+
+ /*
+ * We check the offset/size pairs represent valid file ranges,
+ * then insert them into the list. We don't necessarily have
+ * to check for offsets past the end of the checkpoint, but it's
+ * a cheap test to do here and we'd have to do the check as part
+ * of file verification, regardless.
+ */
+ if (off < block->allocsize ||
+ off % block->allocsize != 0 ||
+ size % block->allocsize != 0 ||
+ off + size > ckpt_size)
+corrupted: WT_PANIC_RET(session, WT_ERROR,
+ "file contains a corrupted %s extent list, range %"
+ PRIdMAX "-%" PRIdMAX " misaligned or past end-of-file",
+ el->name,
+ (intmax_t)off, (intmax_t)(off + size));
+
+ WT_ERR(func(session, el, off, size));
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
+ WT_ERR(__block_extlist_dump(session, "read extlist", el, 0));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_extlist_write --
+ * Write an extent list at the tail of the file.
+ */
+int
+__wt_block_extlist_write(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_PAGE_HEADER *dsk;
+ size_t size;
+ uint32_t entries;
+ uint8_t *p;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_BLOCK))
+ WT_RET(__block_extlist_dump(session, "write extlist", el, 0));
+
+ /*
+ * Figure out how many entries we're writing -- if there aren't any
+ * entries, we're done.
+ */
+ entries = el->entries + (additional == NULL ? 0 : additional->entries);
+ if (entries == 0) {
+ el->offset = WT_BLOCK_INVALID_OFFSET;
+ el->cksum = el->size = 0;
+ return (0);
+ }
+
+ /*
+ * Get a scratch buffer, clear the page's header and data, initialize
+ * the header.
+ *
+ * Allocate memory for the extent list entries plus two additional
+ * entries: the initial WT_BLOCK_EXTLIST_MAGIC/0 pair and the list-
+ * terminating WT_BLOCK_INVALID_OFFSET/0 pair.
+ */
+ size = (entries + 2) * 2 * WT_INTPACK64_MAXSIZE;
+ WT_RET(__wt_block_write_size(session, block, &size));
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ dsk = tmp->mem;
+ memset(dsk, 0, WT_BLOCK_HEADER_BYTE_SIZE);
+ dsk->type = WT_PAGE_BLOCK_MANAGER;
+
+#define WT_EXTLIST_WRITE(p, v) \
+ WT_ERR(__wt_vpack_uint(&(p), 0, (uint64_t)(v)))
+
+ /* Fill the page's data. */
+ p = WT_BLOCK_HEADER_BYTE(dsk);
+ WT_EXTLIST_WRITE(p, WT_BLOCK_EXTLIST_MAGIC); /* Initial value */
+ WT_EXTLIST_WRITE(p, 0);
+ WT_EXT_FOREACH(ext, el->off) { /* Free ranges */
+ WT_EXTLIST_WRITE(p, ext->off);
+ WT_EXTLIST_WRITE(p, ext->size);
+ }
+ if (additional != NULL)
+ WT_EXT_FOREACH(ext, additional->off) { /* Free ranges */
+ WT_EXTLIST_WRITE(p, ext->off);
+ WT_EXTLIST_WRITE(p, ext->size);
+ }
+ WT_EXTLIST_WRITE(p, WT_BLOCK_INVALID_OFFSET); /* Ending value */
+ WT_EXTLIST_WRITE(p, 0);
+
+ dsk->u.datalen = WT_PTRDIFF32(p, WT_BLOCK_HEADER_BYTE(dsk));
+ tmp->size = dsk->mem_size = WT_PTRDIFF32(p, dsk);
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * The extent list is written as a valid btree page because the salvage
+ * functionality might move into the btree layer some day; besides, we
+ * don't need another format, and this way the page format can be easily
+ * verified.
+ */
+ WT_ERR(__wt_verify_dsk(session, "[extent list check]", tmp));
+#endif
+
+ /* Write the extent list to disk. */
+ WT_ERR(__wt_block_write_off(
+ session, block, tmp, &el->offset, &el->size, &el->cksum, 1, 1));
+
+ /*
+ * Remove the allocated blocks from the system's allocation list;
+ * extent blocks never appear on any allocation list.
+ */
+ WT_TRET(__wt_block_off_remove_overlap(
+ session, &block->live.alloc, el->offset, el->size));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s written %" PRIdMAX "/%" PRIu32,
+ el->name, (intmax_t)el->offset, el->size));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
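+
+/*
+ * For reference, the extent list page written above carries a sequence of
+ * variable-length packed pairs (see the WT_EXTLIST_WRITE calls):
+ *
+ *	{WT_BLOCK_EXTLIST_MAGIC, 0}	initial pair
+ *	{off, size} ...			one pair per extent
+ *	{WT_BLOCK_INVALID_OFFSET, 0}	terminating pair
+ *
+ * which is exactly the layout __wt_block_extlist_read unpacks.
+ */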
+
+/*
+ * __wt_block_extlist_truncate --
+ * Truncate the file based on the last available extent in the list.
+ */
+int
+__wt_block_extlist_truncate(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
+{
+ WT_EXT *ext, **astack[WT_SKIP_MAXDEPTH];
+ WT_FH *fh;
+ wt_off_t orig, size;
+
+ fh = block->fh;
+
+ /*
+ * Check if the last available extent is at the end of the file, and if
+ * so, truncate the file and discard the extent.
+ */
+ if ((ext = __block_off_srch_last(el->off, astack)) == NULL)
+ return (0);
+ WT_ASSERT(session, ext->off + ext->size <= fh->size);
+ if (ext->off + ext->size < fh->size)
+ return (0);
+
+ /*
+ * Remove the extent list entry. (Save the value, we need it to reset
+ * the cached file size, and that can't happen until after the extent
+ * list removal succeeds.)
+ */
+ orig = fh->size;
+ size = ext->off;
+ WT_RET(__block_off_remove(session, el, size, NULL));
+ fh->size = size;
+
+ /*
+ * Truncate the file. The truncate might fail if there's a file mapping
+ * (if there's an open checkpoint on the file); that's OK, we'll ignore
+ * those blocks.
+ */
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "truncate file from %" PRIdMAX " to %" PRIdMAX,
+ (intmax_t)orig, (intmax_t)size));
+ WT_RET_BUSY_OK(__wt_ftruncate(session, block->fh, size));
+
+ return (0);
+}
+
+/*
+ * __wt_block_extlist_init --
+ * Initialize an extent list.
+ */
+int
+__wt_block_extlist_init(WT_SESSION_IMPL *session,
+ WT_EXTLIST *el, const char *name, const char *extname, int track_size)
+{
+ size_t size;
+
+ WT_CLEAR(*el);
+
+ /* Size the buffer for "name", ".", "extname" and a terminating nul. */
+ size = (name == NULL ? 0 : strlen(name)) +
+ strlen(".") + (extname == NULL ? 0 : strlen(extname)) + 1;
+ WT_RET(__wt_calloc_def(session, size, &el->name));
+ (void)snprintf(el->name, size, "%s.%s",
+ name == NULL ? "" : name, extname == NULL ? "" : extname);
+
+ el->offset = WT_BLOCK_INVALID_OFFSET;
+ el->track_size = track_size;
+ return (0);
+}
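+
+/*
+ * A usage sketch (the names are illustrative): the live checkpoint's
+ * lists might be set up as
+ *
+ *	__wt_block_extlist_init(session, &ci->alloc, "live", "alloc", 0);
+ *	__wt_block_extlist_init(session, &ci->avail, "live", "avail", 1);
+ *
+ * naming the lists "live.alloc" and "live.avail"; only the avail list
+ * tracks sizes, because block allocation searches by size.
+ */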
+
+/*
+ * __wt_block_extlist_free --
+ * Discard an extent list.
+ */
+void
+__wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el)
+{
+ WT_EXT *ext, *next;
+ WT_SIZE *szp, *nszp;
+
+ __wt_free(session, el->name);
+
+ for (ext = el->off[0]; ext != NULL; ext = next) {
+ next = ext->next[0];
+ __wt_free(session, ext);
+ }
+ for (szp = el->sz[0]; szp != NULL; szp = nszp) {
+ nszp = szp->next[0];
+ __wt_free(session, szp);
+ }
+
+ /* Extent lists are re-used, clear them. */
+ WT_CLEAR(*el);
+}
+
+/*
+ * __block_extlist_dump --
+ * Dump an extent list as verbose messages.
+ */
+static int
+__block_extlist_dump(
+ WT_SESSION_IMPL *session, const char *tag, WT_EXTLIST *el, int show_size)
+{
+ WT_EXT *ext;
+ WT_SIZE *szp;
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: %s: %" PRIu64 " bytes, by offset:%s",
+ tag, el->name, el->bytes, el->entries == 0 ? " [Empty]" : ""));
+ if (el->entries == 0)
+ return (0);
+
+ WT_EXT_FOREACH(ext, el->off)
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "\t{%" PRIuMAX "/%" PRIuMAX "}",
+ (uintmax_t)ext->off, (uintmax_t)ext->size));
+
+ if (!show_size)
+ return (0);
+
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: %s: by size:%s",
+ tag, el->name, el->entries == 0 ? " [Empty]" : ""));
+ if (el->entries == 0)
+ return (0);
+
+ WT_EXT_FOREACH(szp, el->sz) {
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "\t{%" PRIuMAX "}", (uintmax_t)szp->size));
+ WT_EXT_FOREACH_OFF(ext, szp->off)
+ WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
+ "\t\t{%" PRIuMAX "/%" PRIuMAX "}",
+ (uintmax_t)ext->off, (uintmax_t)ext->size));
+ }
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_map.c b/src/third_party/wiredtiger/src/block/block_map.c
new file mode 100644
index 00000000000..68fb75179d9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_map.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_map --
+ * Map a segment of the file in, if possible.
+ */
+int
+__wt_block_map(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp,
+ void **mappingcookie)
+{
+ *(void **)mapp = NULL;
+ *maplenp = 0;
+
+ /*
+ * Turn off mapping when verifying the file, because we can't perform
+ * checksum validation of mapped segments, and verify has to checksum
+ * pages.
+ */
+ if (block->verify)
+ return (0);
+
+ /*
+ * Turn off mapping when direct I/O is configured for the file, the
+ * Linux open(2) documentation says applications should avoid mixing
+ * mmap(2) of files with direct I/O to the same files.
+ */
+ if (block->fh->direct_io)
+ return (0);
+
+ /*
+ * Turn off mapping if the application configured a cache size maximum;
+ * we can't control how much of the cache we use in that case.
+ */
+ if (block->os_cache_max != 0)
+ return (0);
+
+ /*
+ * Map the file into memory. Ignore errors: we'll read the file through
+ * the cache if the mapping fails.
+ */
+ (void)__wt_mmap(session, block->fh, mapp, maplenp, mappingcookie);
+
+ return (0);
+}
+
+/*
+ * __wt_block_unmap --
+ * Unmap any mapped-in segment of the file.
+ */
+int
+__wt_block_unmap(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen,
+ void **mappingcookie)
+{
+ /* Unmap the file from memory. */
+ return (__wt_munmap(session, block->fh, map, maplen, mappingcookie));
+}
diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c
new file mode 100644
index 00000000000..4f7f2898de5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_mgr.c
@@ -0,0 +1,433 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __bm_method_set(WT_BM *, int);
+
+/*
+ * __bm_readonly --
+ * General-purpose "writes not supported on this handle" function.
+ */
+static int
+__bm_readonly(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_RET_MSG(session, ENOTSUP,
+ "%s: write operation on read-only checkpoint handle",
+ bm->block->name);
+}
+
+/*
+ * __bm_addr_string --
+ * Return a printable string representation of an address cookie.
+ */
+static int
+__bm_addr_string(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ return (
+ __wt_block_addr_string(session, bm->block, buf, addr, addr_size));
+}
+
+/*
+ * __bm_addr_valid --
+ * Return if an address cookie is valid.
+ */
+static int
+__bm_addr_valid(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_addr_valid(
+ session, bm->block, addr, addr_size, bm->is_live));
+}
+
+/*
+ * __bm_block_header --
+ * Return the size of the block header.
+ */
+static u_int
+__bm_block_header(WT_BM *bm)
+{
+ return (__wt_block_header(bm->block));
+}
+
+/*
+ * __bm_checkpoint --
+ * Write a buffer into a block, creating a checkpoint.
+ */
+static int
+__bm_checkpoint(WT_BM *bm,
+ WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum)
+{
+ return (__wt_block_checkpoint(
+ session, bm->block, buf, ckptbase, data_cksum));
+}
+
+/*
+ * __bm_sync --
+ * Flush a file to disk.
+ */
+static int
+__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, int async)
+{
+ return (async ?
+ __wt_fsync_async(session, bm->block->fh) :
+ __wt_fsync(session, bm->block->fh));
+}
+
+/*
+ * __bm_checkpoint_load --
+ * Load a checkpoint.
+ */
+static int
+__bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size,
+ uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* If not opening a checkpoint, we're opening the live system. */
+ bm->is_live = !checkpoint;
+ WT_RET(__wt_block_checkpoint_load(session, bm->block,
+ addr, addr_size, root_addr, root_addr_sizep, checkpoint));
+
+ if (checkpoint) {
+ /*
+ * Read-only objects are optionally mapped into memory instead
+ * of being read into cache buffers.
+ */
+ if (conn->mmap)
+ WT_RET(__wt_block_map(session, bm->block,
+ &bm->map, &bm->maplen, &bm->mappingcookie));
+
+ /*
+ * If this handle is for a checkpoint, that is, read-only, there
+ * isn't a lot you can do with it. Although the btree layer
+ * prevents attempts to write a checkpoint reference, paranoia
+ * is healthy.
+ */
+ __bm_method_set(bm, 1);
+ }
+
+ return (0);
+}
+
+/*
+ * __bm_checkpoint_resolve --
+ * Resolve the checkpoint.
+ */
+static int
+__bm_checkpoint_resolve(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_checkpoint_resolve(session, bm->block));
+}
+
+/*
+ * __bm_checkpoint_unload --
+ *	Unload a checkpoint.
+ */
+static int
+__bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ /* Unmap any mapped segment. */
+ if (bm->map != NULL)
+ WT_TRET(__wt_block_unmap(session,
+ bm->block, bm->map, bm->maplen, &bm->mappingcookie));
+
+ /* Unload the checkpoint. */
+ WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live));
+
+ return (ret);
+}
+
+/*
+ * __bm_close --
+ * Close a file.
+ */
+static int
+__bm_close(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ if (bm == NULL) /* Safety check */
+ return (0);
+
+ ret = __wt_block_close(session, bm->block);
+
+ __wt_overwrite_and_free(session, bm);
+ return (ret);
+}
+
+/*
+ * __bm_compact_start --
+ * Start a block manager compaction.
+ */
+static int
+__bm_compact_start(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_compact_start(session, bm->block));
+}
+
+/*
+ * __bm_compact_page_skip --
+ * Return if a page is useful for compaction.
+ */
+static int
+__bm_compact_page_skip(WT_BM *bm, WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, int *skipp)
+{
+ return (__wt_block_compact_page_skip(
+ session, bm->block, addr, addr_size, skipp));
+}
+
+/*
+ * __bm_compact_skip --
+ * Return if a file can be compacted.
+ */
+static int
+__bm_compact_skip(WT_BM *bm, WT_SESSION_IMPL *session, int *skipp)
+{
+ return (__wt_block_compact_skip(session, bm->block, skipp));
+}
+
+/*
+ * __bm_compact_end --
+ * End a block manager compaction.
+ */
+static int
+__bm_compact_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_compact_end(session, bm->block));
+}
+
+/*
+ * __bm_free --
+ * Free a block of space to the underlying file.
+ */
+static int
+__bm_free(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_free(session, bm->block, addr, addr_size));
+}
+
+/*
+ * __bm_stat --
+ * Block-manager statistics.
+ */
+static int
+__bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats)
+{
+ __wt_block_stat(session, bm->block, stats);
+ return (0);
+}
+
+/*
+ * __bm_write --
+ * Write a buffer into a block, returning the block's address cookie.
+ */
+static int
+__bm_write(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
+{
+ return (__wt_block_write(
+ session, bm->block, buf, addr, addr_sizep, data_cksum));
+}
+
+/*
+ * __bm_write_size --
+ * Return the buffer size required to write a block.
+ */
+static int
+__bm_write_size(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep)
+{
+ return (__wt_block_write_size(session, bm->block, sizep));
+}
+
+/*
+ * __bm_salvage_start --
+ * Start a block manager salvage.
+ */
+static int
+__bm_salvage_start(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_salvage_start(session, bm->block));
+}
+
+/*
+ * __bm_salvage_valid --
+ * Inform salvage a block is valid.
+ */
+static int
+__bm_salvage_valid(WT_BM *bm,
+ WT_SESSION_IMPL *session, uint8_t *addr, size_t addr_size, int valid)
+{
+ return (__wt_block_salvage_valid(
+ session, bm->block, addr, addr_size, valid));
+}
+
+/*
+ * __bm_salvage_next --
+ * Return the next block from the file.
+ */
+static int
+__bm_salvage_next(WT_BM *bm,
+ WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, int *eofp)
+{
+ return (__wt_block_salvage_next(
+ session, bm->block, addr, addr_sizep, eofp));
+}
+
+/*
+ * __bm_salvage_end --
+ * End a block manager salvage.
+ */
+static int
+__bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_salvage_end(session, bm->block));
+}
+
+/*
+ * __bm_verify_start --
+ * Start a block manager verify.
+ */
+static int
+__bm_verify_start(WT_BM *bm, WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ return (__wt_block_verify_start(session, bm->block, ckptbase));
+}
+
+/*
+ * __bm_verify_addr --
+ * Verify an address.
+ */
+static int
+__bm_verify_addr(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ return (__wt_block_verify_addr(session, bm->block, addr, addr_size));
+}
+
+/*
+ * __bm_verify_end --
+ * End a block manager verify.
+ */
+static int
+__bm_verify_end(WT_BM *bm, WT_SESSION_IMPL *session)
+{
+ return (__wt_block_verify_end(session, bm->block));
+}
+
+/*
+ * __bm_method_set --
+ * Set up the legal methods.
+ */
+static void
+__bm_method_set(WT_BM *bm, int readonly)
+{
+ if (readonly) {
+ bm->addr_string = __bm_addr_string;
+ bm->addr_valid = __bm_addr_valid;
+ bm->block_header = __bm_block_header;
+ bm->checkpoint = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int))__bm_readonly;
+ bm->checkpoint_load = __bm_checkpoint_load;
+ bm->checkpoint_resolve =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->checkpoint_unload = __bm_checkpoint_unload;
+ bm->close = __bm_close;
+ bm->compact_end =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->compact_page_skip = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ const uint8_t *, size_t, int *))__bm_readonly;
+ bm->compact_skip = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *, int *))__bm_readonly;
+ bm->compact_start =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->free = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly;
+ bm->preload = __wt_bm_preload;
+ bm->read = __wt_bm_read;
+ bm->salvage_end = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->salvage_next = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ uint8_t *, size_t *, int *))__bm_readonly;
+ bm->salvage_start = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *))__bm_readonly;
+ bm->salvage_valid = (int (*)(WT_BM *,
+ WT_SESSION_IMPL *, uint8_t *, size_t, int))__bm_readonly;
+ bm->stat = __bm_stat;
+ bm->sync =
+ (int (*)(WT_BM *, WT_SESSION_IMPL *, int))__bm_readonly;
+ bm->verify_addr = __bm_verify_addr;
+ bm->verify_end = __bm_verify_end;
+ bm->verify_start = __bm_verify_start;
+ bm->write = (int (*)(WT_BM *, WT_SESSION_IMPL *,
+ WT_ITEM *, uint8_t *, size_t *, int))__bm_readonly;
+ bm->write_size = (int (*)
+ (WT_BM *, WT_SESSION_IMPL *, size_t *))__bm_readonly;
+ } else {
+ bm->addr_string = __bm_addr_string;
+ bm->addr_valid = __bm_addr_valid;
+ bm->block_header = __bm_block_header;
+ bm->checkpoint = __bm_checkpoint;
+ bm->checkpoint_load = __bm_checkpoint_load;
+ bm->checkpoint_resolve = __bm_checkpoint_resolve;
+ bm->checkpoint_unload = __bm_checkpoint_unload;
+ bm->close = __bm_close;
+ bm->compact_end = __bm_compact_end;
+ bm->compact_page_skip = __bm_compact_page_skip;
+ bm->compact_skip = __bm_compact_skip;
+ bm->compact_start = __bm_compact_start;
+ bm->free = __bm_free;
+ bm->preload = __wt_bm_preload;
+ bm->read = __wt_bm_read;
+ bm->salvage_end = __bm_salvage_end;
+ bm->salvage_next = __bm_salvage_next;
+ bm->salvage_start = __bm_salvage_start;
+ bm->salvage_valid = __bm_salvage_valid;
+ bm->stat = __bm_stat;
+ bm->sync = __bm_sync;
+ bm->verify_addr = __bm_verify_addr;
+ bm->verify_end = __bm_verify_end;
+ bm->verify_start = __bm_verify_start;
+ bm->write = __bm_write;
+ bm->write_size = __bm_write_size;
+ }
+}
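+
+/*
+ * For example, once __bm_method_set(bm, 1) has run for a checkpoint
+ * handle, a call such as bm->write(bm, session, buf, addr, &addr_size, 0)
+ * resolves to __bm_readonly and fails with ENOTSUP instead of modifying
+ * the checkpoint; the casts above let that one function stand in for
+ * every method with a write side-effect.
+ */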
+
+/*
+ * __wt_block_manager_open --
+ * Open a file.
+ */
+int
+__wt_block_manager_open(WT_SESSION_IMPL *session,
+ const char *filename, const char *cfg[],
+ int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+
+ *bmp = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &bm));
+ __bm_method_set(bm, 0);
+
+ WT_ERR(__wt_block_open(session, filename, cfg,
+ forced_salvage, readonly, allocsize, &bm->block));
+
+ *bmp = bm;
+ return (0);
+
+err: WT_TRET(bm->close(bm, session));
+ return (ret);
+}
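+
+/*
+ * A caller-side sketch (the file name and configuration are hypothetical):
+ *
+ *	WT_BM *bm;
+ *	WT_RET(__wt_block_manager_open(
+ *	    session, "test.wt", cfg, 0, 0, 4096, &bm));
+ *	...
+ *	WT_RET(bm->close(bm, session));
+ *
+ * After the open, block operations go through the bm method table rather
+ * than through direct calls to the underlying __wt_block_* functions.
+ */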
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
new file mode 100644
index 00000000000..2fbaa0fe331
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -0,0 +1,330 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __desc_read(WT_SESSION_IMPL *, WT_BLOCK *);
+
+/*
+ * __wt_block_manager_truncate --
+ * Truncate a file.
+ */
+int
+__wt_block_manager_truncate(
+ WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
+{
+ WT_DECL_RET;
+ WT_FH *fh;
+
+ /* Open the underlying file handle. */
+ WT_RET(__wt_open(session, filename, 0, 0, WT_FILE_TYPE_DATA, &fh));
+
+ /* Truncate the file. */
+ WT_ERR(__wt_ftruncate(session, fh, (wt_off_t)0));
+
+ /* Write out the file's meta-data. */
+ ret = __wt_desc_init(session, fh, allocsize);
+
+ /* Close the file handle. */
+err: WT_TRET(__wt_close(session, fh));
+
+ return (ret);
+}
+
+/*
+ * __wt_block_manager_create --
+ * Create a file.
+ */
+int
+__wt_block_manager_create(
+ WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
+{
+ WT_DECL_RET;
+ WT_FH *fh;
+
+ /* Create the underlying file and open a handle. */
+ WT_RET(__wt_open(session, filename, 1, 1, WT_FILE_TYPE_DATA, &fh));
+
+ /* Write out the file's meta-data. */
+ ret = __wt_desc_init(session, fh, allocsize);
+
+ /* Close the file handle. */
+ WT_TRET(__wt_close(session, fh));
+
+ /* Undo any create on error. */
+ if (ret != 0)
+ WT_TRET(__wt_remove(session, filename));
+
+ return (ret);
+}
+
+/*
+ * __block_destroy --
+ * Destroy a block handle.
+ */
+static int
+__block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ TAILQ_REMOVE(&conn->blockqh, block, q);
+
+ if (block->name != NULL)
+ __wt_free(session, block->name);
+
+ if (block->fh != NULL)
+ WT_TRET(__wt_close(session, block->fh));
+
+ __wt_spin_destroy(session, &block->live_lock);
+
+ __wt_overwrite_and_free(session, block);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_open --
+ * Open a block handle.
+ */
+int
+__wt_block_open(WT_SESSION_IMPL *session,
+ const char *filename, const char *cfg[],
+ int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp)
+{
+ WT_BLOCK *block;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename));
+
+ conn = S2C(session);
+ *blockp = NULL;
+
+ __wt_spin_lock(session, &conn->block_lock);
+ TAILQ_FOREACH(block, &conn->blockqh, q)
+ if (strcmp(filename, block->name) == 0) {
+ ++block->ref;
+ *blockp = block;
+ __wt_spin_unlock(session, &conn->block_lock);
+ return (0);
+ }
+
+ /* Basic structure allocation, initialization. */
+ WT_ERR(__wt_calloc_def(session, 1, &block));
+ block->ref = 1;
+ TAILQ_INSERT_HEAD(&conn->blockqh, block, q);
+
+ WT_ERR(__wt_strdup(session, filename, &block->name));
+ block->allocsize = allocsize;
+
+ WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval));
+ block->allocfirst =
+ WT_STRING_MATCH("first", cval.str, cval.len) ? 1 : 0;
+
+ /* Configuration: optional OS buffer cache maximum size. */
+ WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval));
+ block->os_cache_max = (size_t)cval.val;
+#ifdef HAVE_POSIX_FADVISE
+ if (conn->direct_io && block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported in combination with direct_io");
+#else
+ if (block->os_cache_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_max not supported if posix_fadvise not "
+ "available");
+#endif
+
+ /* Configuration: optional immediate write scheduling flag. */
+ WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval));
+ block->os_cache_dirty_max = (size_t)cval.val;
+#ifdef HAVE_SYNC_FILE_RANGE
+ if (conn->direct_io && block->os_cache_dirty_max)
+ WT_ERR_MSG(session, EINVAL,
+ "os_cache_dirty_max not supported in combination with "
+ "direct_io");
+#else
+ if (block->os_cache_dirty_max) {
+ /*
+ * Ignore any setting if it is not supported.
+ */
+ block->os_cache_dirty_max = 0;
+ WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+ "os_cache_dirty_max ignored when sync_file_range not "
+ "available"));
+ }
+#endif
+
+ /* Open the underlying file handle. */
+ WT_ERR(__wt_open(session, filename, 0, 0,
+ readonly ? WT_FILE_TYPE_CHECKPOINT : WT_FILE_TYPE_DATA,
+ &block->fh));
+
+ /* Initialize the live checkpoint's lock. */
+ WT_ERR(__wt_spin_init(session, &block->live_lock, "block manager"));
+
+ /*
+ * Read the description information from the first block.
+ *
+ * Salvage is a special case: if we're forcing the salvage, we don't
+ * look at anything, including the description information.
+ */
+ if (!forced_salvage)
+ WT_ERR(__desc_read(session, block));
+
+ *blockp = block;
+ __wt_spin_unlock(session, &conn->block_lock);
+ return (0);
+
+err: WT_TRET(__block_destroy(session, block));
+ __wt_spin_unlock(session, &conn->block_lock);
+ return (ret);
+}
+
+/*
+ * __wt_block_close --
+ * Close a block handle.
+ */
+int
+__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ if (block == NULL) /* Safety check */
+ return (0);
+
+ conn = S2C(session);
+
+ WT_TRET(__wt_verbose(session, WT_VERB_BLOCK,
+ "close: %s", block->name == NULL ? "" : block->name ));
+
+ __wt_spin_lock(session, &conn->block_lock);
+
+ /* Reference count is initialized to 1. */
+ if (block->ref == 0 || --block->ref == 0)
+ WT_TRET(__block_destroy(session, block));
+
+ __wt_spin_unlock(session, &conn->block_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_desc_init --
+ * Write a file's initial descriptor structure.
+ */
+int
+__wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize)
+{
+ WT_BLOCK_DESC *desc;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ /* Use a scratch buffer to get correct alignment for direct I/O. */
+ WT_RET(__wt_scr_alloc(session, allocsize, &buf));
+ memset(buf->mem, 0, allocsize);
+
+ desc = buf->mem;
+ desc->magic = WT_BLOCK_MAGIC;
+ desc->majorv = WT_BLOCK_MAJOR_VERSION;
+ desc->minorv = WT_BLOCK_MINOR_VERSION;
+
+ /* Update the checksum. */
+ desc->cksum = 0;
+ desc->cksum = __wt_cksum(desc, allocsize);
+
+ ret = __wt_write(session, fh, (wt_off_t)0, (size_t)allocsize, desc);
+
+ __wt_scr_free(&buf);
+ return (ret);
+}
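+
+/*
+ * For illustration: with a 4KB allocation size the file begins with a
+ * zeroed 4KB block holding only the WT_BLOCK_DESC fields set above (the
+ * magic number, the major/minor version and a checksum computed with the
+ * cksum field itself zeroed), which is what __desc_read verifies on open.
+ */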
+
+/*
+ * __desc_read --
+ * Read and verify the file's metadata.
+ */
+static int
+__desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_BLOCK_DESC *desc;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint32_t cksum;
+
+ /* Use a scratch buffer to get correct alignment for direct I/O. */
+ WT_RET(__wt_scr_alloc(session, block->allocsize, &buf));
+
+ /* Read the first allocation-sized block and verify the file format. */
+ WT_ERR(__wt_read(session,
+ block->fh, (wt_off_t)0, (size_t)block->allocsize, buf->mem));
+
+ desc = buf->mem;
+ WT_ERR(__wt_verbose(session, WT_VERB_BLOCK,
+ "%s: magic %" PRIu32
+ ", major/minor: %" PRIu32 "/%" PRIu32
+ ", checksum %#" PRIx32,
+ block->name, desc->magic,
+ desc->majorv, desc->minorv,
+ desc->cksum));
+
+ /*
+ * We fail the open if the checksum fails, the magic number is wrong, or
+ * the major/minor numbers are unsupported by this version. This
+ * test is done even if the caller is verifying or salvaging the file:
+ * it makes sense for verify, and for salvage we don't overwrite files
+ * without some reason to believe they are WiredTiger files. The user
+ * may have entered the wrong file name, and is now frantically pounding
+ * their interrupt key.
+ */
+ cksum = desc->cksum;
+ desc->cksum = 0;
+ if (desc->magic != WT_BLOCK_MAGIC ||
+ cksum != __wt_cksum(desc, block->allocsize))
+ WT_ERR_MSG(session, WT_ERROR,
+ "%s does not appear to be a WiredTiger file", block->name);
+
+ if (desc->majorv > WT_BLOCK_MAJOR_VERSION ||
+ (desc->majorv == WT_BLOCK_MAJOR_VERSION &&
+ desc->minorv > WT_BLOCK_MINOR_VERSION))
+ WT_ERR_MSG(session, WT_ERROR,
+ "unsupported WiredTiger file version: this build only "
+ "supports major/minor versions up to %d/%d, and the file "
+ "is version %d/%d",
+ WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION,
+ desc->majorv, desc->minorv);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_block_stat --
+ * Block statistics
+ */
+void
+__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
+{
+ /*
+ * We're looking inside the live system's structure, which normally
+ * requires locking: the chances of a corrupted read are probably
+ * non-existent, and it's only statistics information regardless, but
+ * this isn't a common function for applications to call, so take the
+ * lock.
+ */
+ __wt_spin_lock(session, &block->live_lock);
+ WT_STAT_SET(stats, allocation_size, block->allocsize);
+ WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size);
+ WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC);
+ WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION);
+ WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION);
+ WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes);
+ WT_STAT_SET(stats, block_size, block->fh->size);
+ __wt_spin_unlock(session, &block->live_lock);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_read.c b/src/third_party/wiredtiger/src/block/block_read.c
new file mode 100644
index 00000000000..c528ee4a6aa
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_read.c
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bm_preload --
+ * Pre-load a page.
+ */
+int
+__wt_bm_preload(WT_BM *bm,
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BLOCK *block;
+ WT_DECL_RET;
+ wt_off_t offset;
+ uint32_t cksum, size;
+ int mapped;
+
+ WT_UNUSED(addr_size);
+ block = bm->block;
+ ret = EINVAL; /* Play games due to conditional compilation */
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Check for a mapped block. */
+ mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
+ if (mapped)
+ WT_RET(__wt_mmap_preload(
+ session, (uint8_t *)bm->map + offset, size));
+ else {
+#ifdef HAVE_POSIX_FADVISE
+ ret = posix_fadvise(block->fh->fd,
+ (wt_off_t)offset, (wt_off_t)size, POSIX_FADV_WILLNEED);
+#endif
+ if (ret != 0) {
+ WT_DECL_ITEM(tmp);
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+ ret = __wt_block_read_off(
+ session, block, tmp, offset, size, cksum);
+ __wt_scr_free(&tmp);
+ WT_RET(ret);
+ }
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, block_preload);
+
+ return (0);
+}
+
+/*
+ * __wt_bm_read --
+ *	Map or read the block referenced by an address cookie into a buffer.
+ */
+int
+__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ WT_BLOCK *block;
+ int mapped;
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+ block = bm->block;
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /*
+ * Map the block if it's possible.
+ */
+ mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
+ if (mapped) {
+ buf->data = (uint8_t *)bm->map + offset;
+ buf->size = size;
+ WT_RET(__wt_mmap_preload(session, buf->data, buf->size));
+
+ WT_STAT_FAST_CONN_INCR(session, block_map_read);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
+ return (0);
+ }
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * In diagnostic mode, verify the block we're about to read isn't on
+ * the available list, or for live systems, the discard list.
+ */
+ WT_RET(__wt_block_misplaced(
+ session, block, "read", offset, size, bm->is_live));
+#endif
+ /* Read the block. */
+ WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));
+
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system's buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += size) > block->os_cache_max) {
+ WT_DECL_RET;
+
+ block->os_cache = 0;
+ /* Ignore EINVAL - some file systems don't support the flag. */
+ if ((ret = posix_fadvise(block->fh->fd,
+ (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0 &&
+ ret != EINVAL)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ return (0);
+}
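+
+/*
+ * An accounting example for the cache-discard code above, assuming
+ * os_cache_max is configured as 1MB: block->os_cache accumulates the
+ * bytes read; the first read that pushes the running total past 1MB
+ * resets the counter and asks the kernel, via POSIX_FADV_DONTNEED, to
+ * drop this file's pages from the buffer cache.
+ */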
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_block_read_off_blind --
+ * Read the block at an offset, try to figure out what it looks like,
+ * debugging only.
+ */
+int
+__wt_block_read_off_blind(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset)
+{
+ WT_BLOCK_HEADER *blk;
+ uint32_t cksum, size;
+
+ /*
+ * Make sure the buffer is large enough for the header and read the
+ * first allocation-size block.
+ */
+ WT_RET(__wt_buf_init(session, buf, block->allocsize));
+ WT_RET(__wt_read(
+ session, block->fh, offset, (size_t)block->allocsize, buf->mem));
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+
+ /*
+ * Copy out the size and checksum (we're about to re-use the buffer),
+ * and if the size isn't insane, read the rest of the block.
+ */
+ size = blk->disk_size;
+ cksum = blk->cksum;
+ if (__wt_block_offset_invalid(block, offset, size))
+ WT_RET_MSG(session, EINVAL,
+ "block at offset %" PRIuMAX " cannot be a valid block, no "
+ "read attempted",
+ (uintmax_t)offset);
+ return (__wt_block_read_off(session, block, buf, offset, size, cksum));
+}
+#endif
+
+/*
+ * __wt_block_read_off --
+ * Read an addr/size pair referenced block into a buffer.
+ */
+int
+__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum)
+{
+ WT_BLOCK_HEADER *blk;
+ size_t bufsize;
+ uint32_t page_cksum;
+
+ WT_RET(__wt_verbose(session, WT_VERB_READ,
+ "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
+ (uintmax_t)offset, size, cksum));
+
+ /*
+ * Grow the buffer as necessary and read the block. Buffers should be
+ * aligned for reading, but there are lots of buffers (for example, file
+ * cursors have two buffers each, key and value), and it's difficult to
+ * be sure we've found all of them. If the buffer isn't aligned, it's
+ * an easy fix: set the flag and guarantee we reallocate it. (Most of
+ * the time on reads, the buffer memory has not yet been allocated, so
+ * we're not adding any additional processing time.)
+ */
+ if (F_ISSET(buf, WT_ITEM_ALIGNED))
+ bufsize = size;
+ else {
+ F_SET(buf, WT_ITEM_ALIGNED);
+ bufsize = WT_MAX(size, buf->memsize + 10);
+ }
+ WT_RET(__wt_buf_init(session, buf, bufsize));
+ WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
+ buf->size = size;
+
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+ blk->cksum = 0;
+ page_cksum = __wt_cksum(buf->mem,
+ F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
+ if (cksum != page_cksum) {
+ if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ __wt_errx(session,
+ "read checksum error [%"
+ PRIu32 "B @ %" PRIuMAX ", %"
+ PRIu32 " != %" PRIu32 "]",
+ size, (uintmax_t)offset, cksum, page_cksum);
+
+ /* Panic if a checksum fails during an ordinary read. */
+ return (block->verify ||
+ F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+ WT_ERROR :
+ __wt_illegal_value(session, block->name));
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, block_read);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_session.c b/src/third_party/wiredtiger/src/block/block_session.c
new file mode 100644
index 00000000000..fa56b72f49b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_session.c
@@ -0,0 +1,305 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Per session handle cached block manager information.
+ */
+typedef struct {
+ WT_EXT *ext_cache; /* List of WT_EXT handles */
+ u_int ext_cache_cnt; /* Count */
+
+ WT_SIZE *sz_cache; /* List of WT_SIZE handles */
+ u_int sz_cache_cnt; /* Count */
+} WT_BLOCK_MGR_SESSION;
+
+/*
+ * __block_ext_alloc --
+ * Allocate a new WT_EXT structure.
+ */
+static int
+__block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
+{
+ WT_EXT *ext;
+
+ u_int skipdepth;
+
+ skipdepth = __wt_skip_choose_depth(session);
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext));
+ ext->depth = (uint8_t)skipdepth;
+ (*extp) = ext;
+
+ return (0);
+}
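+
+/*
+ * The "2 *" in the allocation above reflects that a WT_EXT node lives on
+ * two skiplists at once: next[0] through next[depth - 1] are the
+ * by-offset forward pointers and next[depth] through next[2 * depth - 1]
+ * are the by-size forward pointers, which is why the recycling code in
+ * __wt_block_ext_alloc clears both next[i] and next[i + ext->depth].
+ */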
+
+/*
+ * __wt_block_ext_alloc --
+ * Return a WT_EXT structure for use.
+ */
+int
+__wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
+{
+ WT_EXT *ext;
+ WT_BLOCK_MGR_SESSION *bms;
+ u_int i;
+
+ bms = session->block_manager;
+
+ /* Return a WT_EXT structure for use from a cached list. */
+ if (bms != NULL && bms->ext_cache != NULL) {
+ ext = bms->ext_cache;
+ bms->ext_cache = ext->next[0];
+
+ /* Clear any left-over references. */
+ for (i = 0; i < ext->depth; ++i)
+ ext->next[i] = ext->next[i + ext->depth] = NULL;
+
+ /*
+ * The count is advisory to minimize our exposure to bugs, but
+ * don't let it go negative.
+ */
+ if (bms->ext_cache_cnt > 0)
+ --bms->ext_cache_cnt;
+
+ *extp = ext;
+ return (0);
+ }
+
+ return (__block_ext_alloc(session, extp));
+}
+
+/*
+ * __block_ext_prealloc --
+ * Pre-allocate WT_EXT structures.
+ */
+static int
+__block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_EXT *ext;
+
+ bms = session->block_manager;
+
+ for (; bms->ext_cache_cnt < max; ++bms->ext_cache_cnt) {
+ WT_RET(__block_ext_alloc(session, &ext));
+
+ ext->next[0] = bms->ext_cache;
+ bms->ext_cache = ext;
+ }
+ return (0);
+}
+
+/*
+ * __wt_block_ext_free --
+ * Add a WT_EXT structure to the cached list.
+ */
+void
+__wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+
+ if ((bms = session->block_manager) == NULL)
+ __wt_free(session, ext);
+ else {
+ ext->next[0] = bms->ext_cache;
+ bms->ext_cache = ext;
+
+ ++bms->ext_cache_cnt;
+ }
+}
+
+/*
+ * __block_ext_discard --
+ * Discard some or all of the WT_EXT structure cache.
+ */
+static int
+__block_ext_discard(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_EXT *ext, *next;
+
+ bms = session->block_manager;
+ if (max != 0 && bms->ext_cache_cnt <= max)
+ return (0);
+
+ for (ext = bms->ext_cache; ext != NULL;) {
+ next = ext->next[0];
+ __wt_free(session, ext);
+ ext = next;
+
+ --bms->ext_cache_cnt;
+ if (max != 0 && bms->ext_cache_cnt <= max)
+ break;
+ }
+ bms->ext_cache = ext;
+
+ if (max == 0 && bms->ext_cache_cnt != 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "incorrect count in session handle's block manager cache");
+ return (0);
+}
+
+/*
+ * __block_size_alloc --
+ * Allocate a new WT_SIZE structure.
+ */
+static int
+__block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp)
+{
+ return (__wt_calloc(session, 1, sizeof(WT_SIZE), szp));
+}
+
+/*
+ * __wt_block_size_alloc --
+ * Return a WT_SIZE structure for use.
+ */
+int
+__wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+
+ bms = session->block_manager;
+
+ /* Return a WT_SIZE structure for use from a cached list. */
+ if (bms != NULL && bms->sz_cache != NULL) {
+ (*szp) = bms->sz_cache;
+ bms->sz_cache = bms->sz_cache->next[0];
+
+ /*
+ * The count is advisory to minimize our exposure to bugs, but
+ * don't let it go negative.
+ */
+ if (bms->sz_cache_cnt > 0)
+ --bms->sz_cache_cnt;
+ return (0);
+ }
+
+ return (__block_size_alloc(session, szp));
+}
+
+/*
+ * __block_size_prealloc --
+ * Pre-allocate WT_SIZE structures.
+ */
+static int
+__block_size_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_SIZE *sz;
+
+ bms = session->block_manager;
+
+ for (; bms->sz_cache_cnt < max; ++bms->sz_cache_cnt) {
+ WT_RET(__block_size_alloc(session, &sz));
+
+ sz->next[0] = bms->sz_cache;
+ bms->sz_cache = sz;
+ }
+ return (0);
+}
+
+/*
+ * __wt_block_size_free --
+ * Add a WT_SIZE structure to the cached list.
+ */
+void
+__wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+
+ if ((bms = session->block_manager) == NULL)
+ __wt_free(session, sz);
+ else {
+ sz->next[0] = bms->sz_cache;
+ bms->sz_cache = sz;
+
+ ++bms->sz_cache_cnt;
+ }
+}
+
+/*
+ * __block_size_discard --
+ * Discard some or all of the WT_SIZE structure cache.
+ */
+static int
+__block_size_discard(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_BLOCK_MGR_SESSION *bms;
+ WT_SIZE *sz, *nsz;
+
+ bms = session->block_manager;
+ if (max != 0 && bms->sz_cache_cnt <= max)
+ return (0);
+
+ for (sz = bms->sz_cache; sz != NULL;) {
+ nsz = sz->next[0];
+ __wt_free(session, sz);
+ sz = nsz;
+
+ --bms->sz_cache_cnt;
+ if (max != 0 && bms->sz_cache_cnt <= max)
+ break;
+ }
+ bms->sz_cache = sz;
+
+ if (max == 0 && bms->sz_cache_cnt != 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "incorrect count in session handle's block manager cache");
+ return (0);
+}
+
+/*
+ * __block_manager_session_cleanup --
+ * Clean up the session handle's block manager information.
+ */
+static int
+__block_manager_session_cleanup(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ if (session->block_manager == NULL)
+ return (0);
+
+ WT_TRET(__block_ext_discard(session, 0));
+ WT_TRET(__block_size_discard(session, 0));
+
+ __wt_free(session, session->block_manager);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_ext_prealloc --
+ * Pre-allocate WT_EXT and WT_SIZE structures.
+ */
+int
+__wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
+{
+ if (session->block_manager == NULL) {
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_BLOCK_MGR_SESSION), &session->block_manager));
+ session->block_manager_cleanup =
+ __block_manager_session_cleanup;
+ }
+ WT_RET(__block_ext_prealloc(session, max));
+ WT_RET(__block_size_prealloc(session, max));
+ return (0);
+}
+
+/*
+ * __wt_block_ext_discard --
+ * Discard WT_EXT and WT_SIZE structures after checkpoint runs.
+ */
+int
+__wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max)
+{
+ WT_RET(__block_ext_discard(session, max));
+ WT_RET(__block_size_discard(session, max));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_slvg.c b/src/third_party/wiredtiger/src/block/block_slvg.c
new file mode 100644
index 00000000000..349daa620f5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_slvg.c
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_salvage_start --
+ * Start a file salvage.
+ */
+int
+__wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ wt_off_t len;
+ uint32_t allocsize;
+
+ allocsize = block->allocsize;
+
+ /* Reset the description information in the first block. */
+ WT_RET(__wt_desc_init(session, block->fh, allocsize));
+
+ /*
+ * Salvage creates a new checkpoint when it's finished, set up for
+ * rolling an empty file forward.
+ */
+ WT_RET(__wt_block_ckpt_init(session, &block->live, "live"));
+
+ /*
+ * Truncate the file to an allocation-size multiple of blocks (bytes
+ * trailing the last block must be garbage, by definition).
+ */
+ if (block->fh->size > allocsize) {
+ len = (block->fh->size / allocsize) * allocsize;
+ if (len != block->fh->size)
+ WT_RET(__wt_ftruncate(session, block->fh, len));
+ } else
+ len = allocsize;
+ block->live.file_size = len;
+
+ /*
+ * The file's first allocation-sized block is description information,
+ * skip it when reading through the file.
+ */
+ block->slvg_off = allocsize;
+
+ /*
+ * The only checkpoint extent we care about is the allocation list.
+ * Start with the entire file on the allocation list, we'll "free"
+ * any blocks we don't want as we process the file.
+ */
+ WT_RET(__wt_block_insert_ext(
+ session, &block->live.alloc, allocsize, len - allocsize));
+
+ return (0);
+}
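+
+/*
+ * For example, with a 4KB allocation size and a 10000B file: the file is
+ * truncated to 8192B (two whole allocation units), the descriptor block
+ * at offset 0 is skipped, and the single extent {4096, 4096} starts life
+ * on the live allocation list.
+ */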
+
+/*
+ * __wt_block_salvage_end --
+ * End a file salvage.
+ */
+int
+__wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ /* Discard the checkpoint. */
+ return (__wt_block_checkpoint_unload(session, block, 0));
+}
+
+/*
+ * __wt_block_offset_invalid --
+ * Return if the block offset is insane.
+ */
+int
+__wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size)
+{
+ if (size == 0) /* < minimum page size */
+ return (1);
+ if (size % block->allocsize != 0) /* not allocation-size units */
+ return (1);
+ if (size > WT_BTREE_PAGE_SIZE_MAX) /* > maximum page size */
+ return (1);
+ /* past end-of-file */
+ if (offset + (wt_off_t)size > block->fh->size)
+ return (1);
+ return (0);
+}
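+
+/*
+ * A quick example, assuming a 4KB allocation size and a 20KB file:
+ * offset 8192/size 12288 is valid (8192 + 12288 is exactly end-of-file),
+ * offset 8192/size 16384 fails the end-of-file test, and any size that
+ * isn't a multiple of 4096 fails the allocation-size test.
+ */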
+
+/*
+ * __wt_block_salvage_next --
+ * Return the address for the next potential block from the file.
+ */
+int
+__wt_block_salvage_next(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp)
+{
+ WT_BLOCK_HEADER *blk;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_FH *fh;
+ wt_off_t max, offset;
+ uint32_t allocsize, cksum, size;
+ uint8_t *endp;
+
+ *eofp = 0;
+
+ fh = block->fh;
+ allocsize = block->allocsize;
+ WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));
+
+ /* Read through the file, looking for pages. */
+ for (max = fh->size;;) {
+ offset = block->slvg_off;
+ if (offset >= max) { /* Check eof. */
+ *eofp = 1;
+ goto done;
+ }
+
+ /*
+ * Read the start of a possible page (an allocation-size block),
+ * and get a page length from it. Once we move past an allocation-size
+ * boundary, we never consider it again.
+ */
+ WT_ERR(__wt_read(
+ session, fh, offset, (size_t)allocsize, tmp->mem));
+ blk = WT_BLOCK_HEADER_REF(tmp->mem);
+ size = blk->disk_size;
+ cksum = blk->cksum;
+
+ /*
+ * Check the block size: if it's not insane, read the block.
+ * Reading the block validates any checksum; if reading the
+ * block succeeds, return its address as a possible page,
+ * otherwise, move past it.
+ */
+ if (!__wt_block_offset_invalid(block, offset, size) &&
+ __wt_block_read_off(
+ session, block, tmp, offset, size, cksum) == 0)
+ break;
+
+ /* Free the allocation-size block. */
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "skipping %" PRIu32 "B at file offset %" PRIuMAX,
+ allocsize, (uintmax_t)offset));
+ WT_ERR(__wt_block_off_free(
+ session, block, offset, (wt_off_t)allocsize));
+ block->slvg_off += allocsize;
+ }
+
+ /* Re-create the address cookie that should reference this block. */
+ endp = addr;
+ WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+ *addr_sizep = WT_PTRDIFF(endp, addr);
+
+done:
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_block_salvage_valid --
+ * Let salvage know if a block is valid.
+ */
+int
+__wt_block_salvage_valid(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid)
+{
+ wt_off_t offset;
+ uint32_t size, cksum;
+
+ WT_UNUSED(addr_size);
+
+ /*
+ * Crack the cookie.
+ * If the upper layer took the block, move past it; if the upper layer
+ * rejected the block, move past an allocation size chunk and free it.
+ */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+ if (valid)
+ block->slvg_off = offset + size;
+ else {
+ WT_RET(__wt_block_off_free(
+ session, block, offset, (wt_off_t)block->allocsize));
+ block->slvg_off = offset + block->allocsize;
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
new file mode 100644
index 00000000000..148b4fa9743
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __verify_ckptfrag_add(
+ WT_SESSION_IMPL *, WT_BLOCK *, wt_off_t, wt_off_t);
+static int __verify_ckptfrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_filefrag_add(
+ WT_SESSION_IMPL *, WT_BLOCK *, const char *, wt_off_t, wt_off_t, int);
+static int __verify_filefrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
+static int __verify_last_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+
+/* The bit list ignores the first block: convert to/from a frag/offset. */
+#define WT_OFF_TO_FRAG(block, off) \
+ ((off) / (block)->allocsize - 1)
+#define WT_FRAG_TO_OFF(block, frag) \
+ (((wt_off_t)((frag) + 1)) * (block)->allocsize)
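+
+/*
+ * Illustrative example (editorial note): with a 512B allocation size,
+ * file offset 512 -- the first block after the description block --
+ * maps to frag 0, and frag 0 maps back to offset 512; a 1MB file
+ * yields 2047 frags.
+ */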
+
+/*
+ * __wt_block_verify_start --
+ * Start file verification.
+ */
+int
+__wt_block_verify_start(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+ wt_off_t size;
+
+ /*
+ * Find the last checkpoint in the list: if there are none, or the only
+ * checkpoint we have is fake, there's no work to do. Don't complain,
+ * that's not our problem to solve.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ ;
+ for (;; --ckpt) {
+ if (ckpt->name != NULL && !F_ISSET(ckpt, WT_CKPT_FAKE))
+ break;
+ if (ckpt == ckptbase)
+ return (0);
+ }
+
+ /* Truncate the file to the size of the last checkpoint. */
+ WT_RET(__verify_last_truncate(session, block, ckpt));
+
+ /*
+ * We're done if the file has no data pages (this happens if we verify
+ * a file immediately after creation or the checkpoint doesn't reflect
+ * any of the data pages).
+ */
+ size = block->fh->size;
+ if (size <= block->allocsize)
+ return (0);
+
+ /* The file size should be a multiple of the allocation size. */
+ if (size % block->allocsize != 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "the file size is not a multiple of the allocation size");
+
+ /*
+ * Allocate a bit array, where each bit represents a single allocation
+ * size piece of the file (this is how we track the parts of the file
+ * we've verified, and check for multiply referenced or unreferenced
+ * blocks). Storing this on the heap seems reasonable; verifying a 1TB
+ * file with a 512B allocation size would require a 256MB bit array:
+ *
+ * (((1 * 2^40) / 512) / 8) = 256 * 2^20
+ *
+ * To verify larger files than we can handle in this way, we'd have to
+ * write parts of the bit array into a disk file.
+ *
+ * Alternatively, we could switch to maintaining ranges of the file as
+ * we do with the extents, but that has its own failure mode, where we
+ * verify many non-contiguous blocks creating too many entries on the
+ * list to fit into memory.
+ */
+ block->frags = (uint64_t)WT_OFF_TO_FRAG(block, size);
+ WT_RET(__bit_alloc(session, block->frags, &block->fragfile));
+
+ /*
+ * We maintain an allocation list that is rolled forward through the
+ * set of checkpoints.
+ */
+ WT_RET(__wt_block_extlist_init(
+ session, &block->verify_alloc, "verify", "alloc", 0));
+
+ /*
+ * The only checkpoint avail list we care about is the last one written;
+ * get it now and initialize the list of file fragments.
+ */
+ WT_RET(__verify_last_avail(session, block, ckpt));
+
+ block->verify = 1;
+ return (0);
+}
+
+/*
+ * __verify_last_avail --
+ * Get the last checkpoint's avail list and load it into the list of file
+ * fragments.
+ */
+static int
+__verify_last_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_RET;
+ WT_EXT *ext;
+ WT_EXTLIST *el;
+
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+
+ el = &ci->avail;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_ERR(__wt_block_extlist_read_avail(
+ session, block, el, ci->file_size));
+ WT_EXT_FOREACH(ext, el->off)
+ if ((ret = __verify_filefrag_add(session, block,
+ "avail-list chunk", ext->off, ext->size, 1)) != 0)
+ break;
+ }
+
+err: __wt_block_ckpt_destroy(session, ci);
+ return (ret);
+}
+
+/*
+ * __verify_last_truncate --
+ * Truncate the file to the last checkpoint's size.
+ */
+static int
+__verify_last_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+{
+ WT_BLOCK_CKPT *ci, _ci;
+ WT_DECL_RET;
+
+ ci = &_ci;
+ WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
+ WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
+ WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));
+
+err: __wt_block_ckpt_destroy(session, ci);
+ return (ret);
+}
+
+/*
+ * __wt_block_verify_end --
+ * End file verification.
+ */
+int
+__wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_DECL_RET;
+
+ /* Confirm we verified every file block. */
+ ret = __verify_filefrag_chk(session, block);
+
+ /* Discard the accumulated allocation list. */
+ __wt_block_extlist_free(session, &block->verify_alloc);
+
+ /* Discard the fragment tracking lists. */
+ __wt_free(session, block->fragfile);
+ __wt_free(session, block->fragckpt);
+
+ block->verify = 0;
+ return (ret);
+}
+
+/*
+ * __wt_verify_ckpt_load --
+ * Verify work done when a checkpoint is loaded.
+ */
+int
+__wt_verify_ckpt_load(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci)
+{
+ WT_EXTLIST *el;
+ WT_EXT *ext;
+ uint64_t frag, frags;
+
+ /* Set the maximum file size for this checkpoint. */
+ block->verify_size = ci->file_size;
+
+ /*
+ * Add the root page and disk blocks used to store the extent lists to
+ * the list of blocks we've "seen" from the file.
+ */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "checkpoint",
+ ci->root_offset, (wt_off_t)ci->root_size, 1));
+ if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "alloc list",
+ ci->alloc.offset, (wt_off_t)ci->alloc.size, 1));
+ if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "avail list",
+ ci->avail.offset, (wt_off_t)ci->avail.size, 1));
+ if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__verify_filefrag_add(session, block, "discard list",
+ ci->discard.offset, (wt_off_t)ci->discard.size, 1));
+
+ /*
+ * Checkpoint verification is similar to deleting checkpoints. As we
+ * read each new checkpoint, we merge the allocation lists (accumulating
+ * all allocated pages as we move through the system), and then remove
+ * any pages found in the discard list. The result should be a
+ * one-to-one mapping to the pages we find in this specific checkpoint.
+ */
+ el = &ci->alloc;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(
+ session, block, el, ci->file_size));
+ WT_RET(__wt_block_extlist_merge(
+ session, el, &block->verify_alloc));
+ __wt_block_extlist_free(session, el);
+ }
+ el = &ci->discard;
+ if (el->offset != WT_BLOCK_INVALID_OFFSET) {
+ WT_RET(__wt_block_extlist_read(
+ session, block, el, ci->file_size));
+ WT_EXT_FOREACH(ext, el->off)
+ WT_RET(__wt_block_off_remove_overlap(session,
+ &block->verify_alloc, ext->off, ext->size));
+ __wt_block_extlist_free(session, el);
+ }
+
+ /*
+ * The root page of the checkpoint appears on the alloc list, but not,
+ * at least until the checkpoint is deleted, on a discard list. To
+ * handle this case, remove the root page from the accumulated list of
+ * checkpoint pages, so it doesn't add a new requirement for subsequent
+ * checkpoints.
+ */
+ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET)
+ WT_RET(__wt_block_off_remove_overlap(session,
+ &block->verify_alloc, ci->root_offset, ci->root_size));
+
+ /*
+ * Allocate the per-checkpoint bit map. The per-checkpoint bit map is
+ * the opposite of the per-file bit map, that is, we set all the bits
+ * that we expect to be set based on the checkpoint's allocation and
+ * discard lists, then clear bits as we verify blocks. When finished
+ * verifying the checkpoint, the bit list should be empty.
+ */
+ WT_RET(__bit_alloc(session, block->frags, &block->fragckpt));
+ el = &block->verify_alloc;
+ WT_EXT_FOREACH(ext, el->off) {
+ frag = (uint64_t)WT_OFF_TO_FRAG(block, ext->off);
+ frags = (uint64_t)(ext->size / block->allocsize);
+ __bit_nset(block->fragckpt, frag, frag + (frags - 1));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_verify_ckpt_unload --
+ * Verify work done when a checkpoint is unloaded.
+ */
+int
+__wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ WT_DECL_RET;
+
+ /* Confirm we verified every checkpoint block. */
+ ret = __verify_ckptfrag_chk(session, block);
+
+ /* Discard the per-checkpoint fragment list. */
+ __wt_free(session, block->fragckpt);
+
+ return (ret);
+}
+
+/*
+ * __wt_block_verify_addr --
+ * Update an address in a checkpoint as verified.
+ */
+int
+__wt_block_verify_addr(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, const uint8_t *addr, size_t addr_size)
+{
+ wt_off_t offset;
+ uint32_t cksum, size;
+
+ WT_UNUSED(addr_size);
+
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
+
+ /* Add to the per-file list. */
+ WT_RET(__verify_filefrag_add(session, block, NULL, offset, size, 0));
+
+ /*
+ * It's tempting to try and flag a page as "verified" when we read it.
+ * That doesn't work because we may visit a page multiple times when
+ * verifying a single checkpoint (for example, when verifying the
+ * physical image of a row-store leaf page with overflow keys, the
+ * overflow keys are read when checking for key sort issues, and read
+ * again when more general overflow item checking is done). This
+ * function is called by the btree verification code, once per logical
+ * visit in a checkpoint, so we can detect if a page is referenced
+ * multiple times within a single checkpoint. This doesn't apply to
+ * the per-file list, because it is expected for the same btree blocks
+ * to appear in multiple checkpoints.
+ *
+ * Add the block to the per-checkpoint list.
+ */
+ WT_RET(__verify_ckptfrag_add(session, block, offset, size));
+
+ return (0);
+}
+
+/*
+ * __verify_filefrag_add --
+ * Add the fragments to the per-file fragment list, optionally complain if
+ * we've already verified this chunk of the file.
+ */
+static int
+__verify_filefrag_add(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ const char *type, wt_off_t offset, wt_off_t size, int nodup)
+{
+ uint64_t f, frag, frags, i;
+
+ WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
+ "add file block%s%s%s at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+ type == NULL ? "" : " (",
+ type == NULL ? "" : type,
+ type == NULL ? "" : ")",
+ (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));
+
+ /* Check each chunk against the total file size. */
+ if (offset + size > block->fh->size)
+ WT_RET_MSG(session, WT_ERROR,
+ "fragment %" PRIuMAX "-%" PRIuMAX " references "
+ "non-existent file blocks",
+ (uintmax_t)offset, (uintmax_t)(offset + size));
+
+ frag = (uint64_t)WT_OFF_TO_FRAG(block, offset);
+ frags = (uint64_t)(size / block->allocsize);
+
+ /* It may be illegal to reference a particular chunk more than once. */
+ if (nodup)
+ for (f = frag, i = 0; i < frags; ++f, ++i)
+ if (__bit_test(block->fragfile, f))
+ WT_RET_MSG(session, WT_ERROR,
+ "file fragment at %" PRIuMAX " referenced "
+ "multiple times",
+ (uintmax_t)offset);
+
+ /* Add fragments to the file's fragment list. */
+ __bit_nset(block->fragfile, frag, frag + (frags - 1));
+
+ return (0);
+}
+
+/*
+ * __verify_filefrag_chk --
+ * Verify we've checked all the fragments in the file.
+ */
+static int
+__verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ uint64_t count, first, last;
+
+ /* If there's nothing to verify, it was a fast run. */
+ if (block->frags == 0)
+ return (0);
+
+ /*
+ * It's OK if we have not verified blocks at the end of the file: that
+ * happens if the file is truncated during a checkpoint or load or was
+ * extended after writing a checkpoint. We should never see unverified
+ * blocks anywhere else, though.
+ *
+ * I'm deliberately testing for a last fragment of 0: it makes no sense
+ * for no fragments at all to have been verified, so complain if even
+ * the first fragment in the file wasn't verified.
+ */
+ for (last = block->frags - 1; last != 0; --last) {
+ if (__bit_test(block->fragfile, last))
+ break;
+ __bit_set(block->fragfile, last);
+ }
+
+ /*
+ * Check for any other file fragments we haven't verified -- every time
+ * we find a bit that's clear, complain. We re-start the search each
+ * time after setting the clear bit(s) we found: it's simpler and this
+ * isn't supposed to happen a lot.
+ */
+ for (count = 0;; ++count) {
+ if (__bit_ffc(block->fragfile, block->frags, &first) != 0)
+ break;
+ __bit_set(block->fragfile, first);
+ for (last = first + 1; last < block->frags; ++last) {
+ if (__bit_test(block->fragfile, last))
+ break;
+ __bit_set(block->fragfile, last);
+ }
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
+ continue;
+
+ __wt_errx(session,
+ "file range %" PRIuMAX "-%" PRIuMAX " never verified",
+ (uintmax_t)WT_FRAG_TO_OFF(block, first),
+ (uintmax_t)WT_FRAG_TO_OFF(block, last));
+ }
+ if (count == 0)
+ return (0);
+
+ __wt_errx(session, "file ranges never verified: %" PRIu64, count);
+ return (WT_ERROR);
+}
+
+/*
+ * __verify_ckptfrag_add --
+ * Clear the fragments in the per-checkpoint fragment list, and complain if
+ * we've already verified this chunk of the checkpoint.
+ */
+static int
+__verify_ckptfrag_add(
+ WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size)
+{
+ uint64_t f, frag, frags, i;
+
+ WT_RET(__wt_verbose(session, WT_VERB_VERIFY,
+ "add checkpoint block at %" PRIuMAX "-%" PRIuMAX " (%" PRIuMAX ")",
+ (uintmax_t)offset, (uintmax_t)(offset + size), (uintmax_t)size));
+
+ /*
+ * Check each chunk against the checkpoint's size; a checkpoint should
+ * never reference a block outside of its stored size.
+ */
+ if (offset + size > block->verify_size)
+ WT_RET_MSG(session, WT_ERROR,
+ "fragment %" PRIuMAX "-%" PRIuMAX " references "
+ "file blocks outside the checkpoint",
+ (uintmax_t)offset, (uintmax_t)(offset + size));
+
+ frag = (uint64_t)WT_OFF_TO_FRAG(block, offset);
+ frags = (uint64_t)(size / block->allocsize);
+
+ /* It is illegal to reference a particular chunk more than once. */
+ for (f = frag, i = 0; i < frags; ++f, ++i)
+ if (!__bit_test(block->fragckpt, f))
+ WT_RET_MSG(session, WT_ERROR,
+ "fragment at %" PRIuMAX " referenced multiple "
+ "times in a single checkpoint or found in the "
+ "checkpoint but not listed in the checkpoint's "
+ "allocation list",
+ (uintmax_t)offset);
+
+ /* Remove fragments from the checkpoint's allocation list. */
+ __bit_nclr(block->fragckpt, frag, frag + (frags - 1));
+
+ return (0);
+}
+
+/*
+ * __verify_ckptfrag_chk --
+ * Verify we've checked all the fragments in the checkpoint.
+ */
+static int
+__verify_ckptfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
+{
+ uint64_t count, first, last;
+
+ /*
+ * The checkpoint fragment memory is only allocated as a checkpoint
+ * is successfully loaded; don't check if there's nothing there.
+ */
+ if (block->fragckpt == NULL)
+ return (0);
+
+ /*
+ * Check for checkpoint fragments we haven't verified -- every time we
+ * find a bit that's set, complain. We re-start the search each time
+ * after clearing the set bit(s) we found: it's simpler and this isn't
+ * supposed to happen a lot.
+ */
+ for (count = 0;; ++count) {
+ if (__bit_ffs(block->fragckpt, block->frags, &first) != 0)
+ break;
+ __bit_clear(block->fragckpt, first);
+ for (last = first + 1; last < block->frags; ++last) {
+ if (!__bit_test(block->fragckpt, last))
+ break;
+ __bit_clear(block->fragckpt, last);
+ }
+
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_VERIFY))
+ continue;
+
+ __wt_errx(session,
+ "checkpoint range %" PRIuMAX "-%" PRIuMAX " never verified",
+ (uintmax_t)WT_FRAG_TO_OFF(block, first),
+ (uintmax_t)WT_FRAG_TO_OFF(block, last));
+ }
+
+ if (count == 0)
+ return (0);
+
+ __wt_errx(session,
+ "checkpoint ranges never verified: %" PRIu64, count);
+ return (WT_ERROR);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
new file mode 100644
index 00000000000..0da6380e61f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -0,0 +1,269 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_block_header --
+ * Return the size of the block-specific header.
+ */
+u_int
+__wt_block_header(WT_BLOCK *block)
+{
+ WT_UNUSED(block);
+
+ return ((u_int)WT_BLOCK_HEADER_SIZE);
+}
+
+/*
+ * __wt_block_write_size --
+ * Return the buffer size required to write a block.
+ */
+int
+__wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
+{
+ WT_UNUSED(session);
+
+ /*
+ * We write the page size, in bytes, into the block's header as a 4B
+ * unsigned value, and it's possible for the engine to accept an item
+ * we can't write. For example, a huge key/value where the allocation
+ * size has been set to something large will overflow 4B when it tries
+ * to align the write. We could make this work (for example, writing
+ * the page size in units of allocation size or something else), but
+ * it's not worth the effort, writing 4GB objects into a btree makes
+ * no sense. Limit the writes to (4GB - 1KB), it gives us potential
+ * mode bits, and I'm not interested in debugging corner cases anyway.
+ */
+ *sizep = (size_t)
+ WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize);
+ return (*sizep > UINT32_MAX - 1024 ? EINVAL : 0);
+}
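+
+/*
+ * Illustrative arithmetic (editorial note): with a 4KB allocation size,
+ * a 10,000 byte page image plus the block header skip is aligned up to
+ * 12,288 bytes (three allocation units), the size actually written to
+ * the file; the function fails only if the aligned size approaches 4GB.
+ */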
+
+/*
+ * __wt_block_write --
+ * Write a buffer into a block, returning the block's address cookie.
+ */
+int
+__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum)
+{
+ wt_off_t offset;
+ uint32_t size, cksum;
+ uint8_t *endp;
+
+ WT_RET(__wt_block_write_off(
+ session, block, buf, &offset, &size, &cksum, data_cksum, 0));
+
+ endp = addr;
+ WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+ *addr_sizep = WT_PTRDIFF(endp, addr);
+
+ return (0);
+}
+
+/*
+ * __wt_block_write_off --
+ * Write a buffer into a block, returning the block's offset, size and
+ * checksum.
+ */
+int
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
+ int data_cksum, int caller_locked)
+{
+ WT_BLOCK_HEADER *blk;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t align_size;
+ wt_off_t offset;
+ int local_locked;
+
+ blk = WT_BLOCK_HEADER_REF(buf->mem);
+ fh = block->fh;
+ local_locked = 0;
+
+ /* Buffers should be aligned for writing. */
+ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
+ WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
+ WT_RET_MSG(session, EINVAL,
+ "direct I/O check: write buffer incorrectly allocated");
+ }
+
+ /*
+ * Align the size to an allocation unit.
+ *
+ * The buffer must be big enough for us to zero to the next allocsize
+ * boundary; this is one of the reasons the btree layer must find out
+ * from the block-manager layer the maximum size of the eventual write.
+ */
+ align_size = WT_ALIGN(buf->size, block->allocsize);
+ if (align_size > buf->memsize) {
+ WT_ASSERT(session, align_size <= buf->memsize);
+ WT_RET_MSG(session, EINVAL,
+ "buffer size check: write buffer incorrectly allocated");
+ }
+ if (align_size > UINT32_MAX) {
+ WT_ASSERT(session, align_size <= UINT32_MAX);
+ WT_RET_MSG(session, EINVAL,
+ "buffer size check: write buffer too large to write");
+ }
+
+ /* Zero out any unused bytes at the end of the buffer. */
+ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);
+
+ /*
+ * Set the disk size so we don't have to incrementally read blocks
+ * during salvage.
+ */
+ blk->disk_size = WT_STORE_SIZE(align_size);
+
+ /*
+ * Update the block's checksum: if our caller specifies, checksum the
+ * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
+ * bytes. The assumption is applications with good compression support
+ * turn off checksums and assume corrupted blocks won't decompress
+ * correctly. However, if compression failed to shrink the block, the
+ * block wasn't compressed, in which case our caller will tell us to
+ * checksum the data to detect corruption. If compression succeeded,
+ * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
+ * because they're not compressed, both to give salvage a quick test
+ * of whether a block is useful and to give us a test so we don't lose
+ * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
+ */
+ blk->flags = 0;
+ if (data_cksum)
+ F_SET(blk, WT_BLOCK_DATA_CKSUM);
+ blk->cksum = 0;
+ blk->cksum = __wt_cksum(
+ buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
+
+ if (!caller_locked) {
+ WT_RET(__wt_block_ext_prealloc(session, 5));
+ __wt_spin_lock(session, &block->live_lock);
+ local_locked = 1;
+ }
+ ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
+
+ /*
+ * Extend the file in chunks. We want to limit the number of threads
+ * extending the file at the same time, so choose the one thread that's
+ * crossing the extended boundary. We don't extend newly created files,
+ * and it's theoretically possible we might wait so long that our
+ * extension of the file is passed by another thread writing single
+ * blocks; that's
+ * why there's a check in case the extended file size becomes too small:
+ * if the file size catches up, every thread tries to extend it.
+ *
+ * File extension may require locking: some variants of the system call
+ * used to extend the file initialize the extended space. If a writing
+ * thread races with the extending thread, the extending thread might
+ * overwrite already written data, and that would be very, very bad.
+ *
+ * Some variants of the system call to extend the file fail at run-time
+ * based on the filesystem type, fall back to ftruncate in that case,
+ * and remember that ftruncate requires locking.
+ */
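+ /*
+ * Illustrative numbers (editorial note): with a 10MB extend_len,
+ * the single thread whose block straddles the point 10MB below
+ * extend_size bumps extend_size to its offset plus 20MB and
+ * performs the extension; concurrent writers fail the test and
+ * skip the work.
+ */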
+ if (ret == 0 &&
+ fh->extend_len != 0 &&
+ (fh->extend_size <= fh->size ||
+ (offset + fh->extend_len <= fh->extend_size &&
+ offset +
+ fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) {
+ fh->extend_size = offset + fh->extend_len * 2;
+ if (fh->fallocate_available) {
+ /*
+ * Release any locally acquired lock if it's not needed
+ * to extend the file, extending the file might require
+ * updating file metadata, which can be slow. (It may be
+ * a bad idea to configure for file extension on systems
+ * that require locking over the extend call.)
+ */
+ if (!fh->fallocate_requires_locking && local_locked) {
+ __wt_spin_unlock(session, &block->live_lock);
+ local_locked = 0;
+ }
+
+ /* Extend the file. */
+ if ((ret = __wt_fallocate(session,
+ fh, offset, fh->extend_len * 2)) == ENOTSUP) {
+ ret = 0;
+ goto extend_truncate;
+ }
+ } else {
+extend_truncate: /*
+ * We may have a caller lock or a locally acquired lock,
+ * but we need a lock to call ftruncate.
+ */
+ if (!caller_locked && local_locked == 0) {
+ __wt_spin_lock(session, &block->live_lock);
+ local_locked = 1;
+ }
+ /*
+ * The truncate might fail if there's a file mapping
+ * (if there's an open checkpoint on the file), that's
+ * OK.
+ */
+ if ((ret = __wt_ftruncate(
+ session, fh, offset + fh->extend_len * 2)) == EBUSY)
+ ret = 0;
+ }
+ }
+ /* Release any locally acquired lock. */
+ if (local_locked) {
+ __wt_spin_unlock(session, &block->live_lock);
+ local_locked = 0;
+ }
+ WT_RET(ret);
+
+ /* Write the block. */
+ if ((ret =
+ __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
+ if (!caller_locked)
+ __wt_spin_lock(session, &block->live_lock);
+ WT_TRET(__wt_block_off_free(
+ session, block, offset, (wt_off_t)align_size));
+ if (!caller_locked)
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_RET(ret);
+ }
+
+#ifdef HAVE_SYNC_FILE_RANGE
+ /*
+ * Optionally schedule writes for dirty pages in the system buffer
+ * cache, but only if the current session can wait.
+ */
+ if (block->os_cache_dirty_max != 0 &&
+ (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
+ __wt_session_can_wait(session)) {
+ block->os_cache_dirty = 0;
+ WT_RET(__wt_fsync_async(session, fh));
+ }
+#endif
+#ifdef HAVE_POSIX_FADVISE
+ /* Optionally discard blocks from the system buffer cache. */
+ if (block->os_cache_max != 0 &&
+ (block->os_cache += align_size) > block->os_cache_max) {
+ block->os_cache = 0;
+ if ((ret = posix_fadvise(fh->fd,
+ (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
+ WT_RET_MSG(
+ session, ret, "%s: posix_fadvise", block->name);
+ }
+#endif
+ WT_STAT_FAST_CONN_INCR(session, block_write);
+ WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);
+
+ WT_RET(__wt_verbose(session, WT_VERB_WRITE,
+ "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
+ (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));
+
+ *offsetp = offset;
+ *sizep = WT_STORE_SIZE(align_size);
+ *cksump = blk->cksum;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/bloom/bloom.c b/src/third_party/wiredtiger/src/bloom/bloom.c
new file mode 100644
index 00000000000..8c8c8bc723e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/bloom/bloom.c
@@ -0,0 +1,351 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define WT_BLOOM_TABLE_CONFIG "key_format=r,value_format=1t,exclusive=true"
+
+/*
+ * __bloom_init --
+ * Allocate a WT_BLOOM handle.
+ */
+static int
+__bloom_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *config, WT_BLOOM **bloomp)
+{
+ WT_BLOOM *bloom;
+ WT_DECL_RET;
+ size_t len;
+
+ *bloomp = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &bloom));
+
+ WT_ERR(__wt_strdup(session, uri, &bloom->uri));
+ len = strlen(WT_BLOOM_TABLE_CONFIG) + 2;
+ if (config != NULL)
+ len += strlen(config);
+ WT_ERR(__wt_calloc_def(session, len, &bloom->config));
+ /* Add the standard config at the end, so it overrides user settings. */
+ (void)snprintf(bloom->config, len,
+ "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG);
+
+ bloom->session = session;
+
+ *bloomp = bloom;
+ return (0);
+
+err: __wt_free(session, bloom->uri);
+ __wt_free(session, bloom->config);
+ __wt_free(session, bloom->bitstring);
+ __wt_free(session, bloom);
+ return (ret);
+}
+
+/*
+ * __bloom_setup --
+ * Populate the bloom structure.
+ *
+ * Setup is passed either the expected count of items (n) or the length
+ * of the bitstring (m), depending on whether the function is called via
+ * create or open.
+ */
+static int
+__bloom_setup(
+ WT_BLOOM *bloom, uint64_t n, uint64_t m, uint32_t factor, uint32_t k)
+{
+ if (k < 2)
+ return (EINVAL);
+
+ bloom->k = k;
+ bloom->factor = factor;
+ if (n != 0) {
+ bloom->n = n;
+ bloom->m = bloom->n * bloom->factor;
+ } else {
+ bloom->m = m;
+ bloom->n = bloom->m / bloom->factor;
+ }
+ return (0);
+}
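+
+/*
+ * Illustrative math (editorial note, the standard Bloom filter
+ * approximation, not computed by this code): with factor=8 bits per
+ * item and k=4 hashes, m = 8n and the expected false-positive rate is
+ * roughly (1 - e^(-k/factor))^k = (1 - e^(-0.5))^4, about 2.4%.
+ */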
+
+/*
+ * __wt_bloom_create --
+ *
+ * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory to
+ * use while populating the bloom filter.
+ *
+ * count - is the expected number of inserted items
+ * factor - is the number of bits to use per inserted item
+ * k - is the number of hash values to set or test per item
+ */
+int
+__wt_bloom_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config,
+ uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp)
+{
+ WT_BLOOM *bloom;
+ WT_DECL_RET;
+
+ WT_RET(__bloom_init(session, uri, config, &bloom));
+ WT_ERR(__bloom_setup(bloom, count, 0, factor, k));
+
+ WT_ERR(__bit_alloc(session, bloom->m, &bloom->bitstring));
+
+ *bloomp = bloom;
+ return (0);
+
+err: (void)__wt_bloom_close(bloom);
+ return (ret);
+}
+
+/*
+ * __bloom_open_cursor --
+ * Open a cursor to read from a Bloom filter.
+ */
+static int
+__bloom_open_cursor(WT_BLOOM *bloom, WT_CURSOR *owner)
+{
+ WT_CURSOR *c;
+ WT_SESSION_IMPL *session;
+ const char *cfg[3];
+
+ if ((c = bloom->c) != NULL)
+ return (0);
+
+ session = bloom->session;
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = bloom->config;
+ cfg[2] = NULL;
+ c = NULL;
+ WT_RET(__wt_open_cursor(session, bloom->uri, owner, cfg, &c));
+
+ /* XXX Layering violation: bump the cache priority for Bloom filters. */
+ ((WT_CURSOR_BTREE *)c)->btree->evict_priority = WT_EVICT_INT_SKEW;
+
+ bloom->c = c;
+ return (0);
+}
+
+/*
+ * __wt_bloom_open --
+ * Open a Bloom filter object for use by a single session. The filter must
+ * have been created and finalized.
+ */
+int
+__wt_bloom_open(WT_SESSION_IMPL *session,
+ const char *uri, uint32_t factor, uint32_t k,
+ WT_CURSOR *owner, WT_BLOOM **bloomp)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ uint64_t size;
+
+ WT_RET(__bloom_init(session, uri, NULL, &bloom));
+ WT_ERR(__bloom_open_cursor(bloom, owner));
+ c = bloom->c;
+
+ /* Find the largest key, to get the size of the filter. */
+ WT_ERR(c->prev(c));
+ WT_ERR(c->get_key(c, &size));
+ WT_ERR(c->reset(c));
+
+ WT_ERR(__bloom_setup(bloom, 0, size, factor, k));
+
+ *bloomp = bloom;
+ return (0);
+
+err: (void)__wt_bloom_close(bloom);
+ return (ret);
+}
+
+/*
+ * __wt_bloom_insert --
+ * Adds the given key to the Bloom filter.
+ */
+int
+__wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key)
+{
+ uint64_t h1, h2;
+ uint32_t i;
+
+ h1 = __wt_hash_fnv64(key->data, key->size);
+ h2 = __wt_hash_city64(key->data, key->size);
+ for (i = 0; i < bloom->k; i++, h1 += h2)
+ __bit_set(bloom->bitstring, h1 % bloom->m);
+ return (0);
+}
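+
+/*
+ * Editorial note: the insert loop derives the i-th probe as h1 + i * h2
+ * (mod m), the standard double-hashing construction; two base hashes
+ * stand in for k independent hash functions without rehashing the key k
+ * times. The read path in __wt_bloom_hash_get reproduces the same probe
+ * sequence.
+ */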
+
+/*
+ * __wt_bloom_finalize --
+ * Writes the Bloom filter to stable storage. After calling finalize, only
+ * read operations can be performed on the bloom filter.
+ */
+int
+__wt_bloom_finalize(WT_BLOOM *bloom)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_ITEM values;
+ WT_SESSION *wt_session;
+ uint64_t i;
+
+ wt_session = (WT_SESSION *)bloom->session;
+ WT_CLEAR(values);
+
+ /*
+ * Create a bit table to store the bloom filter in.
+ * TODO: should this call __wt_schema_create directly?
+ */
+ WT_RET(wt_session->create(wt_session, bloom->uri, bloom->config));
+ WT_RET(wt_session->open_cursor(
+ wt_session, bloom->uri, NULL, "bulk=bitmap", &c));
+
+ /* Add the entries from the array into the table. */
+ for (i = 0; i < bloom->m; i += values.size) {
+ /* Adjust bits to bytes for the string offset. */
+ values.data = bloom->bitstring + (i >> 3);
+ /*
+ * Shave off some bytes for pure paranoia, in case WiredTiger
+ * reserves some special sizes. Choose a value so that if
+ * we do multiple inserts, it will be on a byte boundary.
+ */
+ values.size = (uint32_t)WT_MIN(bloom->m - i, UINT32_MAX - 127);
+ c->set_value(c, &values);
+ WT_ERR(c->insert(c));
+ }
+
+err: WT_TRET(c->close(c));
+ __wt_free(bloom->session, bloom->bitstring);
+ bloom->bitstring = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_bloom_hash --
+ * Calculate the hash values for a given key.
+ */
+int
+__wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash)
+{
+ WT_UNUSED(bloom);
+
+ bhash->h1 = __wt_hash_fnv64(key->data, key->size);
+ bhash->h2 = __wt_hash_city64(key->data, key->size);
+
+ return (0);
+}
+
+/*
+ * __wt_bloom_hash_get --
+ * Tests whether the key (as given by its hash signature) is in the Bloom
+ * filter. Returns zero if found, WT_NOTFOUND if not.
+ */
+int
+__wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ int result;
+ uint32_t i;
+ uint64_t h1, h2;
+ uint8_t bit;
+
+ /* Get operations are only supported by finalized bloom filters. */
+ WT_ASSERT(bloom->session, bloom->bitstring == NULL);
+
+ /* Create a cursor on the first time through. */
+ WT_ERR(__bloom_open_cursor(bloom, NULL));
+ c = bloom->c;
+
+ h1 = bhash->h1;
+ h2 = bhash->h2;
+
+ result = 0;
+ for (i = 0; i < bloom->k; i++, h1 += h2) {
+ /*
+ * Add 1 to the hash because WiredTiger tables are 1 based and
+ * the original bitstring array was 0 based.
+ */
+ c->set_key(c, (h1 % bloom->m) + 1);
+ WT_ERR(c->search(c));
+ WT_ERR(c->get_value(c, &bit));
+
+ if (bit == 0) {
+ result = WT_NOTFOUND;
+ break;
+ }
+ }
+ WT_ERR(c->reset(c));
+ return (result);
+
+err: /* Don't return WT_NOTFOUND from a failed search. */
+ if (ret == WT_NOTFOUND)
+ ret = WT_ERROR;
+ __wt_err(bloom->session, ret, "Failed lookup in bloom filter.");
+ return (ret);
+}
+
+/*
+ * __wt_bloom_get --
+ * Tests whether the given key is in the Bloom filter.
+ * Returns zero if found, WT_NOTFOUND if not.
+ */
+int
+__wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key)
+{
+ WT_BLOOM_HASH bhash;
+
+ WT_RET(__wt_bloom_hash(bloom, key, &bhash));
+ return (__wt_bloom_hash_get(bloom, &bhash));
+}
+
+/*
+ * __wt_bloom_close --
+ * Close the Bloom filter, release any resources.
+ */
+int
+__wt_bloom_close(WT_BLOOM *bloom)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = bloom->session;
+
+ if (bloom->c != NULL)
+ ret = bloom->c->close(bloom->c);
+ __wt_free(session, bloom->uri);
+ __wt_free(session, bloom->config);
+ __wt_free(session, bloom->bitstring);
+ __wt_free(session, bloom);
+
+ return (ret);
+}
+
+/*
+ * __wt_bloom_drop --
+ * Drop a Bloom filter, release any resources.
+ */
+int
+__wt_bloom_drop(WT_BLOOM *bloom, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ wt_session = (WT_SESSION *)bloom->session;
+ if (bloom->c != NULL) {
+ ret = bloom->c->close(bloom->c);
+ bloom->c = NULL;
+ }
+ WT_TRET(wt_session->drop(wt_session, bloom->uri, config));
+ WT_TRET(__wt_bloom_close(bloom));
+
+ return (ret);
+}
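+
+/*
+ * Illustrative usage sketch (editorial addition, compiled out): the
+ * typical lifecycle of the functions above, assuming a valid session
+ * and URI; the guard macro is hypothetical.
+ */
+#ifdef WT_BLOOM_USAGE_EXAMPLE
+static int
+__bloom_usage_example(WT_SESSION_IMPL *session)
+{
+ WT_BLOOM *bloom;
+ WT_DECL_RET;
+ WT_ITEM key;
+
+ /* Expect 1000 items, 8 bits per item, 4 hash functions. */
+ WT_RET(__wt_bloom_create(
+ session, "file:bloom.example", NULL, 1000, 8, 4, &bloom));
+
+ WT_CLEAR(key);
+ key.data = "key";
+ key.size = 3;
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+
+ /* Write the filter; after this, only reads are allowed. */
+ WT_ERR(__wt_bloom_finalize(bloom));
+
+ /* Returns 0 if probably present, WT_NOTFOUND if definitely not. */
+ ret = __wt_bloom_get(bloom, &key);
+
+err: WT_TRET(__wt_bloom_close(bloom));
+ return (ret);
+}
+#endif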
diff --git a/src/third_party/wiredtiger/src/btree/bt_compact.c b/src/third_party/wiredtiger/src/btree/bt_compact.c
new file mode 100644
index 00000000000..e81c951e9f6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_compact.c
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __compact_rewrite --
+ * Return if a page needs to be re-written.
+ */
+static int
+__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ *skipp = 1; /* Default skip. */
+
+ bm = S2BT(session)->bm;
+ page = ref->page;
+ mod = page->modify;
+
+ /*
+ * Ignore the root: it may not have a replacement address, and besides,
+ * if anything else gets written, so will it.
+ */
+ if (__wt_ref_is_root(ref))
+ return (0);
+
+ /* Ignore currently dirty pages; they will be written regardless. */
+ if (__wt_page_is_modified(page))
+ return (0);
+
+ /*
+ * If the page is clean, test the original addresses.
+ * If the page is a 1-to-1 replacement, test the replacement addresses.
+ * Ignore empty pages, they get merged into the parent.
+ */
+ if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr == NULL)
+ return (0);
+ WT_RET(
+ bm->compact_page_skip(bm, session, addr, addr_size, skipp));
+ } else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
+ /*
+ * The page's modification information can change underfoot if
+ * the page is being reconciled, lock the page down.
+ */
+ WT_PAGE_LOCK(session, page);
+ ret = bm->compact_page_skip(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size, skipp);
+ WT_PAGE_UNLOCK(session, page);
+ WT_RET(ret);
+ }
+ return (0);
+}
+
+/*
+ * __wt_compact --
+ * Compact a file.
+ */
+int
+__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_REF *ref;
+ int block_manager_begin, skip;
+
+ WT_UNUSED(cfg);
+
+ conn = S2C(session);
+ btree = S2BT(session);
+ bm = btree->bm;
+ ref = NULL;
+ block_manager_begin = 0;
+
+ WT_STAT_FAST_DATA_INCR(session, session_compact);
+
+ /*
+ * Check if compaction might be useful -- the API layer will quit trying
+ * to compact the data source if we make no progress; the block layer
+ * sets a flag if it thinks compaction is possible.
+ */
+ WT_RET(bm->compact_skip(bm, session, &skip));
+ if (skip)
+ return (0);
+
+ /*
+ * Reviewing in-memory pages requires looking at page reconciliation
+ * results, because we care about where the page is stored now, not
+ * where the page was stored when we first read it into the cache.
+ * We need to ensure we don't race with page reconciliation as it's
+ * writing the page modify information.
+ *
+ * There are three ways we call reconciliation: checkpoints, threads
+ * writing leaf pages (usually in preparation for a checkpoint), and
+ * eviction.
+ *
+ * We're holding the schema lock which serializes with checkpoints.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ /*
+ * Get the tree handle's flush lock which blocks threads writing leaf
+ * pages.
+ */
+ __wt_spin_lock(session, &btree->flush_lock);
+
+ /*
+ * That leaves eviction; we don't want to block eviction. Set a flag
+ * so reconciliation knows compaction is running. If reconciliation
+ * sees the flag it locks the page it's writing, we acquire the same
+ * lock when reading the page's modify information, serializing access.
+ * The same page lock blocks work on the page, but compaction is an
+ * uncommon, heavy-weight operation. If it's ever a problem, there's
+ * no reason we couldn't use an entirely separate lock than the page
+ * lock.
+ *
+ * We also need to ensure we don't race with an on-going reconciliation.
+ * After we set the flag, wait for eviction of this file to drain, and
+ * then let eviction continue.
+ */
+ conn->compact_in_memory_pass = 1;
+ WT_ERR(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+
+ /* Start compaction. */
+ WT_ERR(bm->compact_start(bm, session));
+ block_manager_begin = 1;
+
+ /* Walk the tree reviewing pages to see if they should be re-written. */
+ session->compaction = 1;
+ for (;;) {
+ /*
+ * Pages read for compaction aren't "useful"; don't update the
+ * read generation of pages already in memory, and if a page is
+ * read, set its generation to a low value so it is evicted
+ * quickly.
+ */
+ WT_ERR(__wt_tree_walk(session, &ref,
+ WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
+ if (ref == NULL)
+ break;
+
+ WT_ERR(__compact_rewrite(session, ref, &skip));
+ if (skip)
+ continue;
+
+ /* Rewrite the page: mark the page and tree dirty. */
+ WT_ERR(__wt_page_modify_init(session, ref->page));
+ __wt_page_modify_set(session, ref->page);
+
+ WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
+ }
+
+err: if (ref != NULL)
+ WT_TRET(__wt_page_release(session, ref, 0));
+
+ if (block_manager_begin)
+ WT_TRET(bm->compact_end(bm, session));
+
+ __wt_spin_unlock(session, &btree->flush_lock);
+
+ conn->compact_in_memory_pass = 0;
+ WT_FULL_BARRIER();
+
+ return (ret);
+}
+
+/*
+ * __wt_compact_page_skip --
+ * Return if compaction requires we read this page.
+ */
+int
+__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_BM *bm;
+ size_t addr_size;
+ u_int type;
+ const uint8_t *addr;
+
+ *skipp = 0; /* Default to reading. */
+ type = 0; /* Keep compiler quiet. */
+
+ bm = S2BT(session)->bm;
+
+ /*
+ * We aren't holding a hazard pointer, so we can't look at the page
+ * itself, all we can look at is the WT_REF information. If there's no
+ * address, the page isn't on disk, but we have to read internal pages
+ * to walk the tree regardless; throw up our hands and read it.
+ */
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));
+ if (addr == NULL)
+ return (0);
+
+ /*
+ * Internal pages must be read to walk the tree; ask the block-manager
+ * if it's useful to rewrite leaf pages; don't do the I/O if a rewrite
+ * won't help.
+ */
+ return (type == WT_CELL_ADDR_INT ? 0 :
+ bm->compact_page_skip(bm, session, addr, addr_size, skipp));
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
new file mode 100644
index 00000000000..0cc79776634
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -0,0 +1,468 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __cursor_fix_append_next --
+ * Return the next entry on the append list.
+ */
+static inline int
+__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
+ return (WT_NOTFOUND);
+ } else
+ if (cbt->recno >= WT_INSERT_RECNO(cbt->ins) &&
+ (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
+ return (WT_NOTFOUND);
+
+ /*
+ * This code looks different from the cursor-previous code. The append
+ * list appears on the last page of the tree, but it may be preceded by
+ * other rows, which means the cursor's recno will be set to a value and
+ * we simply want to increment it. If the cursor's recno is NOT set,
+ * we're starting our iteration in a tree that has only appended items.
+ * In that case, recno will be 0 and happily enough the increment will
+ * set it to 1, which is correct.
+ */
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+ /*
+ * Fixed-width column store appends are inherently non-transactional.
+ * Even a non-visible update by a concurrent or aborted transaction
+ * changes the effective end of the data. The effect is subtle because
+ * of the blurring between deleted and empty values, but ideally we
+ * would skip all uncommitted changes at the end of the data. This
+ * doesn't apply to variable-width column stores because the implicitly
+ * created records written by reconciliation are deleted and so can
+ * never be seen by a read.
+ *
+ * The problem is that we don't know at this point whether there may be
+ * multiple uncommitted changes at the end of the data, and it would be
+ * expensive to check every time we hit an aborted update. If an
+ * insert is aborted, we simply return zero (empty), regardless of
+ * whether we are at the end of the data.
+ */
+ if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
+ (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
+ cbt->v = 0;
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_fix_next --
+ * Move to the next, fixed-length column-store item.
+ */
+static inline int
+__cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BTREE *btree;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = S2BT(session);
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_fix_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, page->pg_fix_recno);
+ goto new_page;
+ }
+
+ /* Move to the next entry and return the item. */
+ if (cbt->recno >= cbt->last_standard_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+new_page:
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
+ if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
+ cbt->ins = NULL;
+ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd == NULL) {
+ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_var_append_next --
+ * Return the next variable-length entry on the append list.
+ */
+static inline int
+__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ goto new_page;
+ }
+
+ for (;;) {
+ cbt->ins = WT_SKIP_NEXT(cbt->ins);
+new_page: if (cbt->ins == NULL)
+ return (WT_NOTFOUND);
+
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_var_next --
+ * Move to the next, variable-length column-store item.
+ */
+static inline int
+__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_var_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, page->pg_var_recno);
+ goto new_page;
+ }
+
+ /* Move to the next entry and return the item. */
+ for (;;) {
+ if (cbt->recno >= cbt->last_standard_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno + 1);
+
+new_page: /* Find the matching WT_COL slot. */
+ if ((cip = __col_var_search(page, cbt->recno)) == NULL)
+ return (WT_NOTFOUND);
+ cbt->slot = WT_COL_SLOT(page, cip);
+
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
+ upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /*
+ * If we're at the same slot as the last reference and there's
+ * no matching insert list item, re-use the return information
+ * (so encoded items with large repeat counts aren't repeatedly
+ * decoded). Otherwise, unpack the cell and build the return
+ * information.
+ */
+ if (cbt->cip_saved != cip) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ continue;
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type == WT_CELL_DEL)
+ continue;
+ WT_RET(__wt_page_cell_data_ref(
+ session, page, &unpack, &cbt->tmp));
+
+ cbt->cip_saved = cip;
+ }
+ val->data = cbt->tmp.data;
+ val->size = cbt->tmp.size;
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __cursor_row_next --
+ * Move to the next row-store item.
+ */
+static inline int
+__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_INSERT *ins;
+ WT_ITEM *key, *val;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ key = &cbt->iface.key;
+ val = &cbt->iface.value;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part
+ * of the page we're walking (otherwise switching from next to prev
+ * and vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
+ *
+ * New page configuration.
+ */
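+ /*
+ * For example (editorial note): a page with three rows maps to
+ * slots 1-7; slot 1 is the insert list before the first key,
+ * slots 2/4/6 are WT_ROW[0..2], and slots 3/5/7 are the insert
+ * lists following each of those rows.
+ */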
+ if (newpage) {
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ cbt->row_iteration_slot = 1;
+ goto new_insert;
+ }
+
+ /* Move to the next entry and return the item. */
+ for (;;) {
+ /*
+ * Continue traversing any insert list; maintain the insert list
+ * head reference and entry count in case we switch to a cursor
+ * previous movement.
+ */
+ if (cbt->ins != NULL)
+ cbt->ins = WT_SKIP_NEXT(cbt->ins);
+
+new_insert: if ((ins = cbt->ins) != NULL) {
+ if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /* Check for the end of the page. */
+ if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
+ return (WT_NOTFOUND);
+ ++cbt->row_iteration_slot;
+
+ /*
+ * Odd-numbered slots configure as WT_INSERT_HEAD entries,
+ * even-numbered slots configure as WT_ROW entries.
+ */
+ if (cbt->row_iteration_slot & 0x01) {
+ cbt->ins_head = WT_ROW_INSERT_SLOT(
+ page, cbt->row_iteration_slot / 2 - 1);
+ cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
+ goto new_insert;
+ }
+ cbt->ins_head = NULL;
+ cbt->ins = NULL;
+
+ cbt->slot = cbt->row_iteration_slot / 2 - 1;
+ rip = &page->pg_row_d[cbt->slot];
+ upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
+ if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ return (__cursor_row_slot_return(cbt, rip, upd));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_btcur_iterate_setup --
+ * Initialize a cursor for iteration, usually based on a search.
+ */
+void
+__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
+{
+ WT_PAGE *page;
+
+ WT_UNUSED(next);
+
+ /*
+ * We don't currently have to do any setup when we switch between next
+ * and prev calls, but I'm sure we will someday -- I'm leaving support
+ * here for both flags for that reason.
+ */
+ F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);
+
+ /*
+ * If we don't have a search page, then we're done; we're starting at
+ * the beginning or end of the tree, not as a result of a search.
+ */
+ if (cbt->ref == NULL)
+ return;
+ page = cbt->ref->page;
+
+ if (page->type == WT_PAGE_ROW_LEAF) {
+ /*
+ * For row-store pages, we need a single item that tells us the
+ * part of the page we're walking (otherwise switching from next
+ * to prev and vice-versa is just too complicated), so we map
+ * the WT_ROW and WT_INSERT_HEAD insert array slots into a
+ * single name space: slot 1 is the "smallest key insert list",
+ * slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on.
+ * This means WT_INSERT lists are odd-numbered slots, and WT_ROW
+ * array slots are even-numbered slots.
+ */
+ cbt->row_iteration_slot = (cbt->slot + 1) * 2;
+ if (cbt->ins_head != NULL) {
+ if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page))
+ cbt->row_iteration_slot = 1;
+ else
+ cbt->row_iteration_slot += 1;
+ }
+ } else {
+ /*
+ * For column-store pages, calculate the largest record on the
+ * page.
+ */
+ cbt->last_standard_recno = page->type == WT_PAGE_COL_VAR ?
+ __col_var_last_recno(page) : __col_fix_last_recno(page);
+
+ /* If we're traversing the append list, set the reference. */
+ if (cbt->ins_head != NULL &&
+ cbt->ins_head == WT_COL_APPEND(page))
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ }
+}
+
+/*
+ * __wt_btcur_next --
+ * Move to the next record in the tree.
+ */
+int
+__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+ int newpage;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ flags = WT_READ_SKIP_INTL; /* Tree walk flags. */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE);
+
+ WT_RET(__cursor_func_init(cbt, 0));
+
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
+ __wt_btcur_iterate_setup(cbt, 1);
+
+ /*
+ * Walk any page we're holding until the underlying call returns not-
+ * found. Then, move to the next page, until we reach the end of the
+ * file.
+ */
+ page = cbt->ref == NULL ? NULL : cbt->ref->page;
+ for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_append_next(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_append_next(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ if (ret != WT_NOTFOUND)
+ break;
+ } else if (page != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_next(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_next(cbt, newpage);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __cursor_row_next(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret != WT_NOTFOUND)
+ break;
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ continue;
+ }
+ }
+
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
+ WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
+
+ page = cbt->ref->page;
+ WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page));
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
new file mode 100644
index 00000000000..8de784d1f1d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -0,0 +1,560 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Walking backwards through skip lists.
+ *
+ * The skip list stack is an array of pointers set up by a search. It points
+ * to the position a node should go in the skip list. In other words, the skip
+ * list search stack always points *after* the search item (that is, into the
+ * search item's next array).
+ *
+ * Helper macros to go from a stack pointer at level i, pointing into a next
+ * array, back to the insert node containing that next array.
+ */
+#undef PREV_ITEM
+#define PREV_ITEM(ins_head, insp, i) \
+ (((insp) == &(ins_head)->head[i] || (insp) == NULL) ? NULL : \
+ (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next)))
+
+#undef PREV_INS
+#define PREV_INS(cbt, i) \
+ PREV_ITEM((cbt)->ins_head, (cbt)->ins_stack[(i)], (i))
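+
+/*
+ * Illustrative pointer arithmetic (editorial note): if insp points at
+ * node->next[2], then insp - 2 points at node->next[0], and subtracting
+ * offsetof(WT_INSERT, next) from that address recovers node itself; the
+ * head-array test in PREV_ITEM catches stack entries that still point
+ * into the list head, where there is no previous node.
+ */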
+
+/*
+ * __cursor_skip_prev --
+ * Move back one position in a skip list stack (aka "finger").
+ */
+static inline int
+__cursor_skip_prev(WT_CURSOR_BTREE *cbt)
+{
+ WT_INSERT *current, *ins;
+ WT_ITEM key;
+ WT_SESSION_IMPL *session;
+ int i;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+restart:
+ /*
+ * If the search stack does not point at the current item, fill it in
+ * with a search.
+ */
+ while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
+ if (cbt->btree->type == BTREE_ROW) {
+ key.data = WT_INSERT_KEY(current);
+ key.size = WT_INSERT_KEY_SIZE(current);
+ WT_RET(__wt_search_insert(session, cbt, &key));
+ } else
+ cbt->ins = __col_insert_search(cbt->ins_head,
+ cbt->ins_stack, cbt->next_stack,
+ WT_INSERT_RECNO(current));
+ }
+
+ /*
+ * Find the first node up the search stack that does not move.
+ *
+ * The depth of the current item must be at least this level, since we
+ * see it in that many levels of the stack.
+ *
+ * !!! Watch these loops carefully: they all rely on the value of i,
+ * and the exit conditions to end up with the right values are
+ * non-trivial.
+ */
+ ins = NULL; /* -Wconditional-uninitialized */
+ for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++)
+ if ((ins = PREV_INS(cbt, i + 1)) != current)
+ break;
+
+ /*
+ * Find a starting point for the new search. That is either at the
+ * non-moving node if we found a valid node, or the beginning of the
+ * next list down that is not the current node.
+ *
+	 * Since it is the beginning of a list, and we know the current node
+	 * has a skip depth at least this high, any node we find must sort
+ * before the current node.
+ */
+ if (ins == NULL || ins == current)
+ for (; i >= 0; i--) {
+ cbt->ins_stack[i] = NULL;
+ cbt->next_stack[i] = NULL;
+ ins = cbt->ins_head->head[i];
+ if (ins != NULL && ins != current)
+ break;
+ }
+
+ /* Walk any remaining levels until just before the current node. */
+ while (i >= 0) {
+ /*
+ * If we get to the end of a list without finding the current
+ * item, we must have raced with an insert. Restart the search.
+ */
+ if (ins == NULL) {
+ cbt->ins_stack[0] = NULL;
+ cbt->next_stack[0] = NULL;
+ goto restart;
+ }
+ if (ins->next[i] != current) /* Stay at this level */
+ ins = ins->next[i];
+ else { /* Drop down a level */
+ cbt->ins_stack[i] = &ins->next[i];
+ cbt->next_stack[i] = ins->next[i];
+ --i;
+ }
+ }
+
+ /* If we found a previous node, the next one must be current. */
+ if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
+ goto restart;
+
+ cbt->ins = PREV_INS(cbt, 0);
+ return (0);
+}
+
+/*
+ * __cursor_fix_append_prev --
+ * Return the previous fixed-length entry on the append list.
+ */
+static inline int
+__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
+ return (WT_NOTFOUND);
+ } else {
+ /*
+ * Handle the special case of leading implicit records, that is,
+ * there aren't any records in the tree not on the append list,
+ * and the first record on the append list isn't record 1.
+ *
+ * The "right" place to handle this is probably in our caller.
+ * The high-level cursor-previous routine would:
+ * -- call this routine to walk the append list
+ * -- call the routine to walk the standard page items
+ * -- call the tree walk routine looking for a previous page
+ * Each of them returns WT_NOTFOUND, at which point our caller
+ * checks the cursor record number, and if it's larger than 1,
+ * returns the implicit records. Instead, I'm trying to detect
+ * the case here, mostly because I don't want to put that code
+ * into our caller. Anyway, if this code breaks for any reason,
+ * that's the way I'd go.
+ *
+ * If we're not pointing to a WT_INSERT entry, or we can't find
+ * a WT_INSERT record that precedes our record name-space, check
+ * if there are any records on the page. If there aren't, then
+ * we're in the magic zone, keep going until we get to a record
+ * number of 1.
+ */
+ if (cbt->ins != NULL &&
+ cbt->recno <= WT_INSERT_RECNO(cbt->ins))
+ WT_RET(__cursor_skip_prev(cbt));
+ if (cbt->ins == NULL &&
+ (cbt->recno == 1 || __col_fix_last_recno(page) != 0))
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * This code looks different from the cursor-next code. The append
+ * list appears on the last page of the tree and contains the last
+ * records in the tree. If we're iterating through the tree, starting
+ * at the last record in the tree, by definition we're starting a new
+ * iteration and we set the record number to the last record found in
+ * the tree. Otherwise, decrement the record.
+ */
+ if (newpage)
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ else
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+ /*
+ * Fixed-width column store appends are inherently non-transactional.
+ * Even a non-visible update by a concurrent or aborted transaction
+ * changes the effective end of the data. The effect is subtle because
+ * of the blurring between deleted and empty values, but ideally we
+ * would skip all uncommitted changes at the end of the data. This
+ * doesn't apply to variable-width column stores because the implicitly
+	 * created records written by reconciliation are deleted and so can
+	 * never be seen by a read.
+ */
+ if (cbt->ins == NULL ||
+ cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
+ (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
+ cbt->v = 0;
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_fix_prev --
+ * Move to the previous, fixed-length column-store item.
+ */
+static inline int
+__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_BTREE *btree;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ btree = S2BT(session);
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_fix_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page;
+ }
+
+ /* Move to the previous entry and return the item. */
+ if (cbt->recno == page->pg_fix_recno)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page:
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
+ if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
+ cbt->ins = NULL;
+ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd == NULL) {
+ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ val->data = &cbt->v;
+ } else
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = 1;
+ return (0);
+}
+
+/*
+ * __cursor_var_append_prev --
+ * Return the previous variable-length entry on the append list.
+ */
+static inline int
+__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_ITEM *val;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ val = &cbt->iface.value;
+
+ if (newpage) {
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_page;
+ }
+
+ for (;;) {
+ WT_RET(__cursor_skip_prev(cbt));
+new_page: if (cbt->ins == NULL)
+ return (WT_NOTFOUND);
+
+ __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
+ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_var_prev --
+ * Move to the previous, variable-length column-store item.
+ */
+static inline int
+__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_COL *cip;
+ WT_ITEM *val;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ val = &cbt->iface.value;
+
+ /* Initialize for each new page. */
+ if (newpage) {
+ cbt->last_standard_recno = __col_var_last_recno(page);
+ if (cbt->last_standard_recno == 0)
+ return (WT_NOTFOUND);
+ __cursor_set_recno(cbt, cbt->last_standard_recno);
+ goto new_page;
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ __cursor_set_recno(cbt, cbt->recno - 1);
+
+new_page: if (cbt->recno < page->pg_var_recno)
+ return (WT_NOTFOUND);
+
+ /* Find the matching WT_COL slot. */
+ if ((cip = __col_var_search(page, cbt->recno)) == NULL)
+ return (WT_NOTFOUND);
+ cbt->slot = WT_COL_SLOT(page, cip);
+
+ /* Check any insert list for a matching record. */
+ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
+ upd = cbt->ins == NULL ?
+ NULL : __wt_txn_read(session, cbt->ins->upd);
+ if (upd != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /*
+ * If we're at the same slot as the last reference and there's
+ * no matching insert list item, re-use the return information
+ * (so encoded items with large repeat counts aren't repeatedly
+ * decoded). Otherwise, unpack the cell and build the return
+ * information.
+ */
+ if (cbt->cip_saved != cip) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ continue;
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type == WT_CELL_DEL)
+ continue;
+ WT_RET(__wt_page_cell_data_ref(
+ session, page, &unpack, &cbt->tmp));
+
+ cbt->cip_saved = cip;
+ }
+ val->data = cbt->tmp.data;
+ val->size = cbt->tmp.size;
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __cursor_row_prev --
+ * Move to the previous row-store item.
+ */
+static inline int
+__cursor_row_prev(WT_CURSOR_BTREE *cbt, int newpage)
+{
+ WT_INSERT *ins;
+ WT_ITEM *key, *val;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ key = &cbt->iface.key;
+ val = &cbt->iface.value;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part
+ * of the page we're walking (otherwise switching from next to prev
+ * and vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+ * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
+ * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
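+	 *
+	 * As a worked example (illustration only): on a page with three
+	 * WT_ROW entries, the slots map as
+	 *	slot 1	WT_ROW_INSERT_SMALLEST(page)
+	 *	slot 2	WT_ROW[0]	slot 3	WT_INSERT_HEAD[0]
+	 *	slot 4	WT_ROW[1]	slot 5	WT_INSERT_HEAD[1]
+	 *	slot 6	WT_ROW[2]	slot 7	WT_INSERT_HEAD[2]
+	 * and a newly opened page starts iterating at slot
+	 * 2 * entries + 1, here slot 7.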
+ *
+ * New page configuration.
+ */
+ if (newpage) {
+ /*
+		 * If we haven't instantiated keys on this page, do so now:
+		 * otherwise, the traversal is very, very slow.
+ */
+ if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ WT_RET(__wt_row_leaf_keys(session, page));
+
+ if (page->pg_row_entries == 0)
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ else
+ cbt->ins_head =
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
+ goto new_insert;
+ }
+
+ /* Move to the previous entry and return the item. */
+ for (;;) {
+ /*
+ * Continue traversing any insert list. Maintain the reference
+ * to the current insert element in case we switch to a cursor
+ * next movement.
+ */
+ if (cbt->ins != NULL)
+ WT_RET(__cursor_skip_prev(cbt));
+
+new_insert: if ((ins = cbt->ins) != NULL) {
+ if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
+ WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ val->data = WT_UPDATE_DATA(upd);
+ val->size = upd->size;
+ return (0);
+ }
+
+ /* Check for the beginning of the page. */
+ if (cbt->row_iteration_slot == 1)
+ return (WT_NOTFOUND);
+ --cbt->row_iteration_slot;
+
+ /*
+ * Odd-numbered slots configure as WT_INSERT_HEAD entries,
+ * even-numbered slots configure as WT_ROW entries.
+ */
+ if (cbt->row_iteration_slot & 0x01) {
+ cbt->ins_head = cbt->row_iteration_slot == 1 ?
+ WT_ROW_INSERT_SMALLEST(page) :
+ WT_ROW_INSERT_SLOT(
+ page, cbt->row_iteration_slot / 2 - 1);
+ cbt->ins = WT_SKIP_LAST(cbt->ins_head);
+ goto new_insert;
+ }
+ cbt->ins_head = NULL;
+ cbt->ins = NULL;
+
+ cbt->slot = cbt->row_iteration_slot / 2 - 1;
+ rip = &page->pg_row_d[cbt->slot];
+ upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
+ if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ return (__cursor_row_slot_return(cbt, rip, upd));
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_btcur_prev --
+ * Move to the previous record in the tree.
+ */
+int
+__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+ int newpage;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_prev);
+ WT_STAT_FAST_DATA_INCR(session, cursor_prev);
+
+ flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */
+ if (truncating)
+ LF_SET(WT_READ_TRUNCATE);
+
+ WT_RET(__cursor_func_init(cbt, 0));
+
+ /*
+ * If we aren't already iterating in the right direction, there's
+ * some setup to do.
+ */
+ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
+ __wt_btcur_iterate_setup(cbt, 0);
+
+ /*
+ * Walk any page we're holding until the underlying call returns not-
+ * found. Then, move to the previous page, until we reach the start
+ * of the file.
+ */
+ page = cbt->ref == NULL ? NULL : cbt->ref->page;
+ for (newpage = 0;; newpage = 1) {
+ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_append_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_append_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret == 0)
+ break;
+ F_CLR(cbt, WT_CBT_ITERATE_APPEND);
+ if (ret != WT_NOTFOUND)
+ break;
+ newpage = 1;
+ }
+ if (page != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ ret = __cursor_fix_prev(cbt, newpage);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __cursor_var_prev(cbt, newpage);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __cursor_row_prev(cbt, newpage);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ if (ret != WT_NOTFOUND)
+ break;
+ }
+
+ WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
+ WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
+
+ page = cbt->ref->page;
+ WT_ASSERT(session, !WT_PAGE_IS_INTERNAL(page));
+
+ /*
+ * The last page in a column-store has appended entries.
+ * We handle it separately from the usual cursor code:
+ * it's only that one page and it's in a simple format.
+ */
+ if (page->type != WT_PAGE_ROW_LEAF &&
+ (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
+ F_SET(cbt, WT_CBT_ITERATE_APPEND);
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
new file mode 100644
index 00000000000..5b2d9b055b5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -0,0 +1,1025 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __cursor_size_chk --
+ * Return if an inserted item is too large.
+ */
+static inline int
+__cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ size_t size;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ if (btree->type == BTREE_COL_FIX) {
+ /* Fixed-size column-stores take a single byte. */
+ if (kv->size != 1)
+ WT_RET_MSG(session, EINVAL,
+ "item size of %" WT_SIZET_FMT " does not match "
+ "fixed-length file requirement of 1 byte",
+ kv->size);
+ return (0);
+ }
+
+ /* Don't waste effort, 1GB is always cool. */
+ if (kv->size <= WT_GIGABYTE)
+ return (0);
+
+ /*
+ * There are two checks: what we are willing to store in the tree, and
+ * what the block manager can actually write.
+ */
+ if (kv->size > WT_BTREE_MAX_OBJECT_SIZE)
+ ret = EINVAL;
+ else {
+ size = kv->size;
+ ret = bm->write_size(bm, session, &size);
+ }
+ if (ret != 0)
+ WT_RET_MSG(session, ret,
+ "item size of %" WT_SIZET_FMT " exceeds the maximum "
+ "supported size",
+ kv->size);
+ return (0);
+}
+
+/*
+ * __cursor_fix_implicit --
+ * Return if search went past the end of the tree.
+ */
+static inline int
+__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
+{
+ return (btree->type == BTREE_COL_FIX &&
+ !F_ISSET(cbt, WT_CBT_MAX_RECORD) ? 1 : 0);
+}
+
+/*
+ * __cursor_valid --
+ *	Return if the cursor references a valid key/value pair.
+ */
+static inline int
+__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_COL *cip;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ btree = cbt->btree;
+ page = cbt->ref->page;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ if (updp != NULL)
+ *updp = NULL;
+
+ /*
+ * We may be pointing to an insert object, and we may have a page with
+ * existing entries. Insert objects always have associated update
+ * objects (the value). Any update object may be deleted, or invisible
+ * to us. In the case of an on-page entry, there is by definition a
+ * value that is visible to us, the original page cell.
+ *
+ * If we find a visible update structure, return our caller a reference
+ * to it because we don't want to repeatedly search for the update, it
+ * might suddenly become invisible (imagine a read-uncommitted session
+ * with another session's aborted insert), and we don't want to handle
+ * that potential error every time we look at the value.
+ *
+ * Unfortunately, the objects we might have and their relationships are
+ * different for the underlying page types.
+ *
+ * In the case of row-store, an insert object implies ignoring any page
+ * objects, no insert object can have the same key as an on-page object.
+ * For row-store:
+ * if there's an insert object:
+ * if there's a visible update:
+ * exact match
+ * else
+ * no exact match
+ * else
+ * use the on-page object (which may have an associated
+ * update object that may or may not be visible to us).
+ *
+ * Column-store is more complicated because an insert object can have
+ * the same key as an on-page object: updates to column-store rows
+ * are insert/object pairs, and an invisible update isn't the end as
+ * there may be an on-page object that is visible. This changes the
+ * logic to:
+ * if there's an insert object:
+ * if there's a visible update:
+ * exact match
+ * else if the on-page object's key matches the insert key
+ * use the on-page object
+ *	else
+ *		use the on-page object
+ *
+ * First, check for an insert object with a visible update (a visible
+ * update that's been deleted is not a valid key/value pair).
+ */
+ if (cbt->ins != NULL &&
+ (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ return (0);
+ if (updp != NULL)
+ *updp = upd;
+ return (1);
+ }
+
+ /*
+ * If we don't have an insert object, or in the case of column-store,
+ * there's an insert object but no update was visible to us and the key
+ * on the page is the same as the insert object's key, and the slot as
+ * set by the search function is valid, we can use the original page
+ * information.
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ /*
+ * If search returned an insert object, there may or may not be
+ * a matching on-page object, we have to check. Fixed-length
+ * column-store pages don't have slots, but map one-to-one to
+ * keys, check for retrieval past the end of the page.
+ */
+ if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
+ return (0);
+
+ /*
+ * Updates aren't stored on the page, an update would have
+ * appeared as an "insert" object; no further checks to do.
+ */
+ break;
+ case BTREE_COL_VAR:
+ /*
+ * If search returned an insert object, there may or may not be
+ * a matching on-page object, we have to check. Variable-length
+ * column-store pages don't map one-to-one to keys, but have
+ * "slots", check if search returned a valid slot.
+ */
+ if (cbt->slot >= page->pg_var_entries)
+ return (0);
+
+ /*
+ * Updates aren't stored on the page, an update would have
+ * appeared as an "insert" object; however, variable-length
+ * column store deletes are written into the backing store,
+ * check the cell for a record already deleted when read.
+ */
+ cip = &page->pg_var_d[cbt->slot];
+ if ((cell = WT_COL_PTR(page, cip)) == NULL ||
+ __wt_cell_type(cell) == WT_CELL_DEL)
+ return (0);
+ break;
+ case BTREE_ROW:
+ /*
+ * See above: for row-store, no insert object can have the same
+ * key as an on-page object, we're done.
+ */
+ if (cbt->ins != NULL)
+ return (0);
+
+ /*
+		 * Check if search returned a valid slot (the failure mode is
+ * an empty page, the search function doesn't check, and so the
+ * more exact test is "page->pg_row_entries == 0", but this test
+ * mirrors the column-store test).
+ */
+ if (cbt->slot >= page->pg_row_entries)
+ return (0);
+
+ /* Updates are stored on the page, check for a delete. */
+ if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
+ session, page->pg_row_upd[cbt->slot])) != NULL) {
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ return (0);
+ if (updp != NULL)
+ *updp = upd;
+ }
+ break;
+ }
+ return (1);
+}
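+
+/*
+ * The common caller pattern (as in __wt_btcur_search below) pairs the
+ * search result with this check:
+ *
+ *	if (cbt->compare == 0 && __cursor_valid(cbt, &upd))
+ *		ret = __wt_kv_return(session, cbt, upd);
+ */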
+
+/*
+ * __cursor_col_search --
+ * Column-store search from an application cursor.
+ */
+static inline int
+__cursor_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_col_search(session, cbt->iface.recno, NULL, cbt));
+ return (ret);
+}
+
+/*
+ * __cursor_row_search --
+ * Row-store search from an application cursor.
+ */
+static inline int
+__cursor_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int insert)
+{
+ WT_DECL_RET;
+
+ WT_WITH_PAGE_INDEX(session,
+ ret = __wt_row_search(session, &cbt->iface.key, NULL, cbt, insert));
+ return (ret);
+}
+
+/*
+ * __cursor_col_modify --
+ * Column-store delete, insert, and update from an application cursor.
+ */
+static inline int
+__cursor_col_modify(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+ return (__wt_col_modify(session,
+ cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove));
+}
+
+/*
+ * __cursor_row_modify --
+ * Row-store insert, update and delete from an application cursor.
+ */
+static inline int
+__cursor_row_modify(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+ return (__wt_row_modify(session,
+ cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove));
+}
+
+/*
+ * __wt_btcur_reset --
+ * Invalidate the cursor position.
+ */
+int
+__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_reset);
+ WT_STAT_FAST_DATA_INCR(session, cursor_reset);
+
+ return (__cursor_reset(cbt));
+}
+
+/*
+ * __wt_btcur_search --
+ * Search for a matching record in the tree.
+ */
+int
+__wt_btcur_search(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 0) :
+ __cursor_col_search(session, cbt));
+ if (cbt->compare == 0 && __cursor_valid(cbt, &upd))
+ ret = __wt_kv_return(session, cbt, upd);
+ else if (__cursor_fix_implicit(btree, cbt)) {
+ /*
+ * Creating a record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records.
+ */
+ cbt->recno = cursor->recno;
+ cbt->v = 0;
+ cursor->value.data = &cbt->v;
+ cursor->value.size = 1;
+ } else
+ ret = WT_NOTFOUND;
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_search_near --
+ * Search for a record in the tree.
+ */
+int
+__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ int exact;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+ exact = 0;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ /*
+ * Set the "insert" flag for the btree row-store search; we may intend
+ * to position our cursor at the end of the tree, rather than match an
+ * existing record.
+ */
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 1) :
+ __cursor_col_search(session, cbt));
+
+ /*
+	 * If we find a valid key, return it.
+ *
+ * Else, creating a record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records. In this
+ * case, we instantiate the empty record, it's an exact match.
+ *
+ * Else, move to the next key in the tree (bias for prefix searches).
+ * Cursor next skips invalid rows, so we don't have to test for them
+ * again.
+ *
+ * Else, redo the search and move to the previous key in the tree.
+ * Cursor previous skips invalid rows, so we don't have to test for
+ * them again.
+ *
+ * If that fails, quit, there's no record to return.
+ */
+ if (__cursor_valid(cbt, &upd)) {
+ exact = cbt->compare;
+ ret = __wt_kv_return(session, cbt, upd);
+ } else if (__cursor_fix_implicit(btree, cbt)) {
+ cbt->recno = cursor->recno;
+ cbt->v = 0;
+ cursor->value.data = &cbt->v;
+ cursor->value.size = 1;
+ exact = 0;
+ } else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
+ exact = 1;
+ else {
+ WT_ERR(btree->type == BTREE_ROW ?
+ __cursor_row_search(session, cbt, 1) :
+ __cursor_col_search(session, cbt));
+ if (__cursor_valid(cbt, &upd)) {
+ exact = cbt->compare;
+ ret = __wt_kv_return(session, cbt, upd);
+ } else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
+ exact = -1;
+ }
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
+ *exactp = exact;
+ return (ret);
+}
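+
+/*
+ * At the application level, the exact-match contract above surfaces through
+ * the standard WT_CURSOR API; a hedged sketch (the handle_* functions are
+ * hypothetical):
+ *
+ *	cursor->set_key(cursor, "key42");
+ *	if ((ret = cursor->search_near(cursor, &exact)) == 0) {
+ *		if (exact == 0)
+ *			handle_exact_match(cursor);
+ *		else if (exact > 0)
+ *			handle_nearest_larger_key(cursor);
+ *		else
+ *			handle_nearest_smaller_key(cursor);
+ *	}
+ */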
+
+/*
+ * __wt_btcur_insert --
+ * Insert a record into the tree.
+ */
+int
+__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCRV(session,
+ cursor_insert_bytes, cursor->key.size + cursor->value.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+ WT_RET(__cursor_size_chk(session, &cursor->value));
+
+ /*
+ * The tree is no longer empty: eviction should pay attention to it,
+ * and it's no longer possible to bulk-load into it.
+ */
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ __wt_btree_evictable(session, 1);
+ }
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ /*
+ * If WT_CURSTD_APPEND is set, insert a new record (ignoring
+ * the application's record number). First we search for the
+ * maximum possible record number so the search ends on the
+ * last page. The real record number is assigned by the
+ * serialized append operation.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = UINT64_MAX;
+
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = 0;
+
+ /*
+ * If not overwriting, fail if the key exists. Creating a
+ * record past the end of the tree in a fixed-length
+ * column-store implicitly fills the gap with empty records.
+ * Fail in that case, the record exists.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
+ (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
+ WT_ERR(WT_DUPLICATE_KEY);
+
+ WT_ERR(__cursor_col_modify(session, cbt, 0));
+ if (F_ISSET(cursor, WT_CURSTD_APPEND))
+ cbt->iface.recno = cbt->recno;
+ break;
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+ /*
+ * If not overwriting, fail if the key exists, else insert the
+ * key/value pair.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ cbt->compare == 0 && __cursor_valid(cbt, NULL))
+ WT_ERR(WT_DUPLICATE_KEY);
+
+ ret = __cursor_row_modify(session, cbt, 0);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ /* Insert doesn't maintain a position across calls, clear resources. */
+ if (ret == 0)
+ WT_TRET(__curfile_leave(cbt));
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
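+
+/*
+ * The duplicate-key behavior above is visible through the standard cursor
+ * API; a hedged sketch (URI, key and value are hypothetical):
+ *
+ *	WT_CURSOR *c;
+ *
+ *	ret = wt_session->open_cursor(
+ *	    wt_session, "table:example", NULL, "overwrite=false", &c);
+ *	c->set_key(c, "key");
+ *	c->set_value(c, "value");
+ *	ret = c->insert(c);
+ *
+ * where wt_session is an application-level WT_SESSION handle and the insert
+ * returns WT_DUPLICATE_KEY if "key" already exists.
+ */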
+
+/*
+ * __wt_btcur_update_check --
+ * Check whether an update would conflict.
+ *
+ * This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
+ * they only check for conflicts without updating the tree. It is used to
+ * maintain snapshot isolation for transactions that span multiple chunks
+ * in an LSM tree.
+ */
+int
+__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cursor = &cbt->iface;
+ btree = cbt->btree;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+
+ /*
+ * We are only interested in checking for conflicts.
+ */
+ if (cbt->compare == 0 && cbt->ins != NULL)
+ ret = __wt_txn_update_check(session, cbt->ins->upd);
+ break;
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ WT_TRET(__curfile_leave(cbt));
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_remove --
+ * Remove a record from the tree.
+ */
+int
+__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ /* Remove the record if it exists. */
+ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
+ if (!__cursor_fix_implicit(btree, cbt))
+ WT_ERR(WT_NOTFOUND);
+ /*
+ * Creating a record past the end of the tree in a
+ * fixed-length column-store implicitly fills the
+ * gap with empty records. Return success in that
+ * case, the record was deleted successfully.
+ *
+ * Correct the btree cursor's location: the search
+ * will have pointed us at the previous/next item,
+ * and that's not correct.
+ */
+ cbt->recno = cursor->recno;
+ } else
+ ret = __cursor_col_modify(session, cbt, 1);
+ break;
+ case BTREE_ROW:
+ /* Remove the record if it exists. */
+ WT_ERR(__cursor_row_search(session, cbt, 0));
+ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
+ WT_ERR(WT_NOTFOUND);
+
+ ret = __cursor_row_modify(session, cbt, 1);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+ /*
+ * If the cursor is configured to overwrite and the record is not
+ * found, that is exactly what we want.
+ */
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+
+ return (ret);
+}
+
+/*
+ * __wt_btcur_update --
+ * Update a record in the tree.
+ */
+int
+__wt_btcur_update(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCRV(
+ session, cursor_update_bytes, cursor->value.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+ WT_RET(__cursor_size_chk(session, &cursor->value));
+
+ /*
+ * The tree is no longer empty: eviction should pay attention to it,
+ * and it's no longer possible to bulk-load into it.
+ */
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ __wt_btree_evictable(session, 1);
+ }
+
+retry: WT_RET(__cursor_func_init(cbt, 1));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_col_search(session, cbt));
+
+ /*
+ * If not overwriting, fail if the key doesn't exist. Update
+ * the record if it exists. Creating a record past the end of
+ * the tree in a fixed-length column-store implicitly fills the
+ * gap with empty records. Update the record in that case, the
+ * record exists.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
+ !__cursor_fix_implicit(btree, cbt))
+ WT_ERR(WT_NOTFOUND);
+ ret = __cursor_col_modify(session, cbt, 0);
+ break;
+ case BTREE_ROW:
+ WT_ERR(__cursor_row_search(session, cbt, 1));
+ /*
+ * If not overwriting, fail if the key does not exist.
+ */
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (cbt->compare != 0 || !__cursor_valid(cbt, NULL)))
+ WT_ERR(WT_NOTFOUND);
+ ret = __cursor_row_modify(session, cbt, 0);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: if (ret == WT_RESTART)
+ goto retry;
+
+ /*
+ * If successful, point the cursor at internal copies of the data. We
+ * could shuffle memory in the cursor so the key/value pair are in local
+ * buffer memory, but that's a data copy. We don't want to do another
+ * search (and we might get a different update structure if we race).
+ * To make this work, we add a field to the btree cursor to pass back a
+ * pointer to the modify function's allocated update structure.
+ */
+ if (ret == 0)
+ WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));
+
+ if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_next_random --
+ * Move to a random record in the tree.
+ */
+int
+__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+	 * Only supports row-store: applications could trivially select a random
+ * value from a column-store, if there were any reason to do so.
+ */
+ if (btree->type != BTREE_ROW)
+ WT_RET(ENOTSUP);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ WT_RET(__cursor_func_init(cbt, 1));
+
+ WT_ERR(__wt_row_random(session, cbt));
+ if (__cursor_valid(cbt, &upd))
+ WT_ERR(__wt_kv_return(session, cbt, upd));
+ else
+ WT_ERR(__wt_btcur_search_near(cbt, 0));
+
+err: if (ret != 0)
+ WT_TRET(__cursor_reset(cbt));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_compare --
+ * Return a comparison between two cursors.
+ */
+int
+__wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *a, *b;
+ WT_SESSION_IMPL *session;
+
+ a = (WT_CURSOR *)a_arg;
+ b = (WT_CURSOR *)b_arg;
+ btree = a_arg->btree;
+ session = (WT_SESSION_IMPL *)a->session;
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ /*
+ * Compare the interface's cursor record, not the underlying
+ * cursor reference: the interface's cursor reference is the
+ * one being returned to the application.
+ */
+ if (a->recno < b->recno)
+ *cmpp = -1;
+ else if (a->recno == b->recno)
+ *cmpp = 0;
+ else
+ *cmpp = 1;
+ break;
+ case BTREE_ROW:
+ WT_RET(__wt_compare(
+ session, btree->collator, &a->key, &b->key, cmpp));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __cursor_equals --
+ * Return if two cursors reference the same row.
+ */
+static int
+__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
+{
+ switch (a->btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ /*
+ * Compare the interface's cursor record, not the underlying
+ * cursor reference: the interface's cursor reference is the
+ * one being returned to the application.
+ */
+ if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno)
+ return (1);
+ break;
+ case BTREE_ROW:
+ if (a->ref != b->ref)
+ return (0);
+ if (a->ins != NULL || b->ins != NULL) {
+ if (a->ins == b->ins)
+ return (1);
+ break;
+ }
+ if (a->slot == b->slot)
+ return (1);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __cursor_truncate --
+ * Discard a cursor range from row-store or variable-width column-store
+ * tree.
+ */
+static int
+__cursor_truncate(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
+{
+ WT_DECL_RET;
+
+ /*
+ * First, call the standard cursor remove method to do a full search and
+ * re-position the cursor because we don't have a saved copy of the
+ * page's write generation information, which we need to remove records.
+ * Once that's done, we can delete records without a full search, unless
+ * we encounter a restart error because the page was modified by some
+ * other thread of control; in that case, repeat the full search to
+ * refresh the page's modification information.
+ *
+ * If this is a row-store, we delete leaf pages having no overflow items
+ * without reading them; for that to work, we have to ensure we read the
+ * page referenced by the ending cursor, since we may be deleting only a
+ * partial page at the end of the truncation. Our caller already fully
+ * instantiated the end cursor, so we know that page is pinned in memory
+ * and we can proceed without concern.
+ */
+ if (start == NULL) {
+ do {
+ WT_RET(__wt_btcur_remove(stop));
+ for (;;) {
+ if ((ret = __wt_btcur_prev(stop, 1)) != 0)
+ break;
+ stop->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, stop, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ } else {
+ do {
+ WT_RET(__wt_btcur_remove(start));
+ for (;;) {
+ if (stop != NULL &&
+ __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ }
+
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+}
+
+/*
+ * __cursor_truncate_fix --
+ * Discard a cursor range from fixed-width column-store tree.
+ */
+static int
+__cursor_truncate_fix(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
+{
+ WT_DECL_RET;
+ uint8_t *value;
+
+ /*
+ * Handle fixed-length column-store objects separately: for row-store
+ * and variable-length column-store objects we have "deleted" values
+ * and so returned objects actually exist: fixed-length column-store
+ * objects are filled-in if they don't exist, that is, if you create
+ * record 37, records 1-36 magically appear. Those records can't be
+ * deleted, which means we have to ignore already "deleted" records.
+ *
+ * First, call the standard cursor remove method to do a full search and
+ * re-position the cursor because we don't have a saved copy of the
+ * page's write generation information, which we need to remove records.
+ * Once that's done, we can delete records without a full search, unless
+ * we encounter a restart error because the page was modified by some
+ * other thread of control; in that case, repeat the full search to
+ * refresh the page's modification information.
+ */
+ if (start == NULL) {
+ do {
+ WT_RET(__wt_btcur_remove(stop));
+ for (;;) {
+ if ((ret = __wt_btcur_prev(stop, 1)) != 0)
+ break;
+ stop->compare = 0; /* Exact match */
+ value = (uint8_t *)stop->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, stop, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ } else {
+ do {
+ WT_RET(__wt_btcur_remove(start));
+ for (;;) {
+ if (stop != NULL &&
+ __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ value = (uint8_t *)start->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+ } while (ret == WT_RESTART);
+ }
+
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+}
+
+/*
+ * __wt_btcur_range_truncate --
+ * Discard a cursor range from the tree.
+ */
+int
+__wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (start != NULL) ? start : stop;
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = cbt->btree;
+
+ /*
+ * For recovery, we log the start and stop keys for a truncate
+ * operation, not the individual records removed. On the other hand,
+ * for rollback we need to keep track of all the in-memory operations.
+ *
+ * We deal with this here by logging the truncate range first, then (in
+ * the logging code) disabling writing of the in-memory remove records
+ * to disk.
+ */
+ if (S2C(session)->logging)
+ WT_RET(__wt_txn_truncate_log(session, start, stop));
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ WT_ERR(__cursor_truncate_fix(
+ session, start, stop, __cursor_col_modify));
+ break;
+ case BTREE_COL_VAR:
+ WT_ERR(__cursor_truncate(
+ session, start, stop, __cursor_col_modify));
+ break;
+ case BTREE_ROW:
+ /*
+ * The underlying cursor comparison routine requires cursors be
+ * fully instantiated when truncating row-store objects because
+ * it's comparing page and/or skiplist positions, not keys. (Key
+ * comparison would work, it's only that a key comparison would
+ * be relatively expensive. Column-store objects have record
+ * number keys, so the key comparison is cheap.) Cursors may
+ * have only had their keys set, so we must ensure the cursors
+ * are positioned in the tree.
+ */
+ if (start != NULL)
+ WT_ERR(__wt_btcur_search(start));
+ if (stop != NULL)
+ WT_ERR(__wt_btcur_search(stop));
+ WT_ERR(__cursor_truncate(
+ session, start, stop, __cursor_row_modify));
+ break;
+ }
+
+err: if (S2C(session)->logging)
+ WT_TRET(__wt_txn_truncate_end(session));
+ return (ret);
+}
+
+/*
+ * __wt_btcur_close --
+ * Close a btree cursor.
+ */
+int
+__wt_btcur_close(WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ ret = __curfile_leave(cbt);
+ __wt_buf_free(session, &cbt->search_key);
+ __wt_buf_free(session, &cbt->tmp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
new file mode 100644
index 00000000000..ebbb335d3a8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -0,0 +1,1104 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * We pass around a session handle and output information; group them together.
+ */
+typedef struct {
+ WT_SESSION_IMPL *session; /* Enclosing session */
+
+ /*
+ * When using the standard event handlers, the debugging output has to
+ * do its own message handling because its output isn't line-oriented.
+ */
+ FILE *fp; /* Output file stream */
+ WT_ITEM *msg; /* Buffered message */
+
+ WT_ITEM *tmp; /* Temporary space */
+} WT_DBG;
+
+static const /* Output separator */
+ char * const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
+
+static int __debug_cell(WT_DBG *, const WT_PAGE_HEADER *, WT_CELL_UNPACK *);
+static int __debug_cell_data(
+ WT_DBG *, WT_PAGE *, int type, const char *, WT_CELL_UNPACK *);
+static void __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, int);
+static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
+static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *);
+static void __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *);
+static void __debug_item(WT_DBG *, const char *, const void *, size_t);
+static int __debug_page(WT_DBG *, WT_PAGE *, uint32_t);
+static void __debug_page_col_fix(WT_DBG *, WT_PAGE *);
+static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
+static int __debug_page_col_var(WT_DBG *, WT_PAGE *);
+static int __debug_page_metadata(WT_DBG *, WT_PAGE *);
+static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
+static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
+static int __debug_ref(WT_DBG *, WT_REF *);
+static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
+static int __debug_tree(WT_SESSION_IMPL *, WT_PAGE *, const char *, uint32_t);
+static void __debug_update(WT_DBG *, WT_UPDATE *, int);
+static void __dmsg(WT_DBG *, const char *, ...)
+ WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+static void __dmsg_wrapup(WT_DBG *);
+
+/*
+ * __wt_debug_set_verbose --
+ * Set verbose flags from the debugger.
+ */
+int
+__wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v)
+{
+ const char *cfg[2] = { NULL, NULL };
+ char buf[256];
+
+ snprintf(buf, sizeof(buf), "verbose=[%s]", v);
+ cfg[0] = buf;
+ return (__wt_verbose_config(session, cfg));
+}
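+
+/*
+ * A hedged usage sketch: this entry point exists to be called from a
+ * debugger, for example
+ *
+ *	(gdb) call __wt_debug_set_verbose(session, "evict")
+ *
+ * which is equivalent to configuring "verbose=[evict]" (assuming "evict"
+ * is a verbose category the build supports).
+ */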
+
+/*
+ * __debug_hex_byte --
+ * Output a single byte in hex.
+ */
+static inline void
+__debug_hex_byte(WT_DBG *ds, uint8_t v)
+{
+ static const char hex[] = "0123456789abcdef";
+
+ __dmsg(ds, "#%c%c", hex[(v & 0xf0) >> 4], hex[v & 0x0f]);
+}
+
+/*
+ * __debug_config --
+ * Configure debugging output.
+ */
+static int
+__debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
+{
+ memset(ds, 0, sizeof(WT_DBG));
+
+ ds->session = session;
+
+ WT_RET(__wt_scr_alloc(session, 512, &ds->tmp));
+
+ /*
+ * If we weren't given a file, we use the default event handler, and
+ * we'll have to buffer messages.
+ */
+ if (ofile == NULL)
+ return (__wt_scr_alloc(session, 512, &ds->msg));
+
+ /* If we're using a file, flush on each line. */
+ if ((ds->fp = fopen(ofile, "w")) == NULL)
+ WT_RET_MSG(session, __wt_errno(), "%s", ofile);
+
+ (void)setvbuf(ds->fp, NULL, _IOLBF, 0);
+ return (0);
+}
+
+/*
+ * __dmsg_wrapup --
+ * Flush any remaining output, release resources.
+ */
+static void
+__dmsg_wrapup(WT_DBG *ds)
+{
+ WT_SESSION_IMPL *session;
+ WT_ITEM *msg;
+
+ session = ds->session;
+ msg = ds->msg;
+
+ __wt_scr_free(&ds->tmp);
+
+ /*
+ * Discard the buffer -- it shouldn't have anything in it, but might
+ * as well be cautious.
+ */
+ if (msg != NULL) {
+ if (msg->size != 0)
+ (void)__wt_msg(session, "%s", (char *)msg->mem);
+ __wt_scr_free(&ds->msg);
+ }
+
+ /* Close any file we opened. */
+ if (ds->fp != NULL)
+ (void)fclose(ds->fp);
+}
+
+/*
+ * __dmsg --
+ * Debug message.
+ */
+static void
+__dmsg(WT_DBG *ds, const char *fmt, ...)
+{
+ va_list ap;
+ WT_ITEM *msg;
+ WT_SESSION_IMPL *session;
+ size_t len, space;
+ char *p;
+
+ session = ds->session;
+
+ /*
+ * Debug output chunks are not necessarily terminated with a newline
+ * character. It's easy if we're dumping to a stream, but if we're
+ * dumping to an event handler, which is line-oriented, we must buffer
+ * the output chunk, and pass it to the event handler once we see a
+ * terminating newline.
+ */
+ if (ds->fp == NULL) {
+ msg = ds->msg;
+ for (;;) {
+ p = (char *)msg->mem + msg->size;
+ space = msg->memsize - msg->size;
+ va_start(ap, fmt);
+ len = (size_t)vsnprintf(p, space, fmt, ap);
+ va_end(ap);
+
+ /* Check if there was enough space. */
+ if (len < space) {
+ msg->size += len;
+ break;
+ }
+
+ /*
+ * There's not much to do on error without checking for
+ * an error return on every single printf. Anyway, it's
+ * pretty unlikely and this is debugging output, I'm not
+ * going to worry about it.
+ */
+ if (__wt_buf_grow(
+ session, msg, msg->memsize + len + 128) != 0)
+ return;
+ }
+ if (((uint8_t *)msg->mem)[msg->size - 1] == '\n') {
+ ((uint8_t *)msg->mem)[msg->size - 1] = '\0';
+ (void)__wt_msg(session, "%s", (char *)msg->mem);
+ msg->size = 0;
+ }
+ } else {
+ va_start(ap, fmt);
+ (void)vfprintf(ds->fp, fmt, ap);
+ va_end(ap);
+ }
+}
+
+/*
+ * __wt_debug_addr_print --
+ * Print out an address.
+ */
+int
+__wt_debug_addr_print(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_DECL_ITEM(buf);
+
+ WT_RET(__wt_scr_alloc(session, 128, &buf));
+ fprintf(stderr, "%s\n",
+ __wt_addr_string(session, addr, addr_size, buf));
+ __wt_scr_free(&buf);
+
+ return (0);
+}
+
+/*
+ * __wt_debug_addr --
+ * Read and dump a disk page in debugging mode, using an addr/size pair.
+ */
+int
+__wt_debug_addr(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, const char *ofile)
+{
+ WT_BM *bm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ bm = S2BT(session)->bm;
+
+ WT_RET(__wt_scr_alloc(session, 1024, &buf));
+ WT_ERR(bm->read(bm, session, buf, addr, addr_size));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_offset_blind --
+ * Read and dump a disk page in debugging mode, using a file offset.
+ */
+int
+__wt_debug_offset_blind(
+ WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ /*
+ * This routine depends on the default block manager's view of files,
+ * where an address consists of a file offset, length, and checksum.
+ * This is for debugging only. Other block managers might not see a
+ * file or address the same way, that's why there's no block manager
+ * method.
+ */
+ WT_RET(__wt_scr_alloc(session, 1024, &buf));
+ WT_ERR(__wt_block_read_off_blind(
+ session, S2BT(session)->bm->block, buf, offset));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_offset --
+ * Read and dump a disk page in debugging mode, using a file
+ * offset/size/checksum triplet.
+ */
+int
+__wt_debug_offset(WT_SESSION_IMPL *session,
+ wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE], *endp;
+
+ /*
+ * This routine depends on the default block manager's view of files,
+ * where an address consists of a file offset, length, and checksum.
+ * This is for debugging only: other block managers might not see a
+ * file or address the same way, that's why there's no block manager
+ * method.
+ *
+ * Convert the triplet into an address structure.
+ */
+ endp = addr;
+ WT_RET(__wt_block_addr_to_buffer(
+ S2BT(session)->bm->block, &endp, offset, size, cksum));
+
+ /*
+ * Read the address through the btree I/O functions (so the block is
+ * decompressed as necessary).
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_bt_read(session, buf, addr, WT_PTRDIFF(endp, addr)));
+ ret = __wt_debug_disk(session, buf->mem, ofile);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_debug_disk --
+ * Dump a disk page in debugging mode.
+ */
+int
+__wt_debug_disk(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ __dmsg(ds, "%s page", __wt_page_type_string(dsk->type));
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ __dmsg(ds, ", recno %" PRIu64, dsk->recno);
+ /* FALLTHROUGH */
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __dmsg(ds, ", entries %" PRIu32 "\n", dsk->u.entries);
+ break;
+ case WT_PAGE_OVFL:
+ __dmsg(ds, ", datalen %" PRIu32 "\n", dsk->u.datalen);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ __debug_dsk_col_fix(ds, dsk);
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ ret = __debug_dsk_cell(ds, dsk);
+ break;
+ default:
+ break;
+ }
+
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_dsk_col_fix --
+ * Dump a WT_PAGE_COL_FIX page.
+ */
+static void
+__debug_dsk_col_fix(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ uint32_t i;
+ uint8_t v;
+
+ btree = S2BT(ds->session);
+
+ WT_FIX_FOREACH(btree, dsk, v, i) {
+ __dmsg(ds, "\t{");
+ __debug_hex_byte(ds, v);
+ __dmsg(ds, "}\n");
+ }
+}
+
+/*
+ * __debug_dsk_cell --
+ * Dump a page of WT_CELL's.
+ */
+static int
+__debug_dsk_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i;
+
+ btree = S2BT(ds->session);
+ unpack = &_unpack;
+
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ WT_RET(__debug_cell(ds, dsk, unpack));
+ }
+ return (0);
+}
+
+/*
+ * __debug_tree_shape_info --
+ * Pretty-print information about a page.
+ */
+static char *
+__debug_tree_shape_info(WT_PAGE *page)
+{
+ uint64_t v;
+ static char buf[32];
+
+ v = page->memory_footprint;
+ if (v >= WT_GIGABYTE)
+ snprintf(buf, sizeof(buf), "(%" PRIu64 "G)", v / WT_GIGABYTE);
+ else if (v >= WT_MEGABYTE)
+ snprintf(buf, sizeof(buf), "(%" PRIu64 "M)", v / WT_MEGABYTE);
+ else
+ snprintf(buf, sizeof(buf), "(%" PRIu64 ")", v);
+ return (buf);
+}
+
+/*
+ * __debug_tree_shape_worker --
+ * Dump information about the current page and descend.
+ */
+static void
+__debug_tree_shape_worker(WT_DBG *ds, WT_PAGE *page, int level)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) {
+ __dmsg(ds, "%*s" "I" "%s\n",
+ level, " ", __debug_tree_shape_info(page));
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM)
+ __debug_tree_shape_worker(
+ ds, ref->page, level + 3);
+ } WT_INTL_FOREACH_END;
+ } else
+ __dmsg(ds, "%*s" "L" "%s\n",
+ level, " ", __debug_tree_shape_info(page));
+}
+
+/*
+ * __wt_debug_tree_shape --
+ * Dump the shape of the in-memory tree.
+ */
+int
+__wt_debug_tree_shape(
+ WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ /* A NULL page starts at the top of the tree -- it's a convenience. */
+ if (page == NULL)
+ page = S2BT(session)->root.page;
+
+ __debug_tree_shape_worker(ds, page, 0);
+
+ __dmsg_wrapup(ds);
+ return (0);
+}
+
+#define WT_DEBUG_TREE_LEAF 0x01 /* Debug leaf pages */
+#define WT_DEBUG_TREE_WALK 0x02 /* Descend the tree */
+
+/*
+ * __wt_debug_tree_all --
+ * Dump the in-memory information for a tree, including leaf pages.
+ */
+int
+__wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ return (__debug_tree(
+ session, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK));
+}
+
+/*
+ * __wt_debug_tree --
+ * Dump the in-memory information for a tree, not including leaf pages.
+ */
+int
+__wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ return (__debug_tree(session, page, ofile, WT_DEBUG_TREE_WALK));
+}
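+
+/*
+ * Like __wt_debug_set_verbose, these are most useful from a debugger; a
+ * hedged sketch (the output path is hypothetical, a NULL page starts at
+ * the root):
+ *
+ *	(gdb) call __wt_debug_tree_all(session, (WT_PAGE *)0, "/tmp/tree.out")
+ *
+ * Dumping to a file avoids buffering output through the event handler.
+ */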
+
+/*
+ * __wt_debug_page --
+ * Dump the in-memory information for a page.
+ */
+int
+__wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ ret = __debug_page(ds, page, WT_DEBUG_TREE_LEAF);
+
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_tree --
+ * Dump the in-memory information for a tree.
+ */
+static int
+__debug_tree(
+ WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile, uint32_t flags)
+{
+ WT_DBG *ds, _ds;
+ WT_DECL_RET;
+
+ ds = &_ds;
+ WT_RET(__debug_config(session, ds, ofile));
+
+ /* A NULL page starts at the top of the tree -- it's a convenience. */
+ if (page == NULL)
+ page = S2BT(session)->root.page;
+
+ ret = __debug_page(ds, page, flags);
+
+ __dmsg_wrapup(ds);
+
+ return (ret);
+}
+
+/*
+ * __debug_page --
+ * Dump the in-memory information for an in-memory page.
+ */
+static int
+__debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ /* Dump the page metadata. */
+ WT_RET(__debug_page_metadata(ds, page));
+
+ /* Dump the page. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ __debug_page_col_fix(ds, page);
+ break;
+ case WT_PAGE_COL_INT:
+ WT_RET(__debug_page_col_int(ds, page, flags));
+ break;
+ case WT_PAGE_COL_VAR:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ WT_RET(__debug_page_col_var(ds, page));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_RET(__debug_page_row_int(ds, page, flags));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (LF_ISSET(WT_DEBUG_TREE_LEAF))
+ WT_RET(__debug_page_row_leaf(ds, page));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * __debug_page_metadata --
+ * Dump an in-memory page's metadata.
+ */
+static int
+__debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_PAGE_INDEX *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_SESSION_IMPL *session;
+ uint32_t entries;
+
+ session = ds->session;
+ mod = page->modify;
+
+ __dmsg(ds, "%p", page);
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno);
+ pindex = WT_INTL_INDEX_COPY(page);
+ entries = pindex->entries;
+ break;
+ case WT_PAGE_COL_FIX:
+ __dmsg(ds, " recno %" PRIu64, page->pg_fix_recno);
+ entries = page->pg_fix_entries;
+ break;
+ case WT_PAGE_COL_VAR:
+ __dmsg(ds, " recno %" PRIu64, page->pg_var_recno);
+ entries = page->pg_var_entries;
+ break;
+ case WT_PAGE_ROW_INT:
+ pindex = WT_INTL_INDEX_COPY(page);
+ entries = pindex->entries;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ entries = page->pg_row_entries;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ __dmsg(ds, ": %s\n", __wt_page_type_string(page->type));
+ __dmsg(ds, "\t" "disk %p, entries %" PRIu32, page->dsk, entries);
+ __dmsg(ds, "%s", __wt_page_is_modified(page) ? ", dirty" : ", clean");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
+ __dmsg(ds, ", keys-built");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
+ __dmsg(ds, ", disk-alloc");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
+ __dmsg(ds, ", disk-mapped");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ __dmsg(ds, ", evict-lru");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
+ __dmsg(ds, ", scanning");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING))
+ __dmsg(ds, ", splitting");
+
+ if (mod != NULL)
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ __dmsg(ds, ", empty");
+ break;
+ case WT_PM_REC_MULTIBLOCK:
+ __dmsg(ds, ", multiblock");
+ break;
+ case WT_PM_REC_REPLACE:
+ __dmsg(ds, ", replaced");
+ break;
+ case 0:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ if (mod != NULL)
+ __dmsg(ds, ", write generation=%" PRIu32, mod->write_gen);
+ __dmsg(ds, "\n");
+
+ return (0);
+}
+
+/*
+ * __debug_page_col_fix --
+ * Dump an in-memory WT_PAGE_COL_FIX page.
+ */
+static void
+__debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ const WT_PAGE_HEADER *dsk;
+ WT_SESSION_IMPL *session;
+ uint64_t recno;
+ uint32_t i;
+ uint8_t v;
+
+ session = ds->session;
+ btree = S2BT(session);
+ dsk = page->dsk;
+ recno = page->pg_fix_recno;
+
+ if (dsk != NULL) {
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE_SINGLE(page));
+ WT_FIX_FOREACH(btree, dsk, v, i) {
+ __dmsg(ds, "\t%" PRIu64 "\t{", recno);
+ __debug_hex_byte(ds, v);
+ __dmsg(ds, "}\n");
+
+ /* Check for a match on the update list. */
+ if (ins != NULL && WT_INSERT_RECNO(ins) == recno) {
+ __dmsg(ds,
+ "\tupdate %" PRIu64 "\n",
+ WT_INSERT_RECNO(ins));
+ __debug_update(ds, ins->upd, 1);
+ ins = WT_SKIP_NEXT(ins);
+ }
+ ++recno;
+ }
+ }
+
+ if (WT_COL_UPDATE_SINGLE(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", 1);
+ }
+ if (WT_COL_APPEND(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_APPEND(page), "append", 1);
+ }
+}
+
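/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * WT_FIX_FOREACH above iterates a bitmap of fixed-width column values. One
 * common layout packs each w-bit value MSB-first; the bit order and helper
 * name here are assumptions made for illustration only:
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t
bit_getv(const uint8_t *bitf, uint32_t entry, uint8_t width)
{
	uint32_t bit = entry * width;	/* starting bit of this entry */
	uint8_t v = 0;

	for (uint8_t i = 0; i < width; ++i, ++bit)
		v = (uint8_t)((v << 1) |
		    ((bitf[bit >> 3] >> (7 - (bit & 7))) & 1));
	return (v);
}

int
main(void)
{
	/* Two 4-bit values, 0x3 and 0xA, packed into a single byte. */
	uint8_t bitmap[] = { 0x3A };

	printf("%x %x\n", bit_getv(bitmap, 0, 4), bit_getv(bitmap, 1, 4));
	return (0);
}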
+/*
+ * __debug_page_col_int --
+ * Dump an in-memory WT_PAGE_COL_INT page.
+ */
+static int
+__debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+
+ session = ds->session;
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno);
+ WT_RET(__debug_ref(ds, ref));
+ } WT_INTL_FOREACH_END;
+
+ if (LF_ISSET(WT_DEBUG_TREE_WALK))
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM) {
+ __dmsg(ds, "\n");
+ WT_RET(__debug_page(ds, ref->page, flags));
+ }
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+}
+
+/*
+ * __debug_page_col_var --
+ * Dump an in-memory WT_PAGE_COL_VAR page.
+ */
+static int
+__debug_page_col_var(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COL *cip;
+ WT_INSERT_HEAD *update;
+ uint64_t recno, rle;
+ uint32_t i;
+ char tag[64];
+
+ unpack = &_unpack;
+ recno = page->pg_var_recno;
+
+ WT_COL_FOREACH(page, cip, i) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ unpack = NULL;
+ rle = 1;
+ } else {
+ /* Reset unpack: a deleted cell in a prior iteration cleared it. */
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ rle = __wt_cell_rle(unpack);
+ }
+ snprintf(tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle);
+ WT_RET(
+ __debug_cell_data(ds, page, WT_PAGE_COL_VAR, tag, unpack));
+
+ if ((update = WT_COL_UPDATE(page, cip)) != NULL)
+ __debug_col_skip(ds, update, "update", 0);
+ recno += rle;
+ }
+
+ if (WT_COL_APPEND(page) != NULL) {
+ __dmsg(ds, "%s", sep);
+ __debug_col_skip(ds, WT_COL_APPEND(page), "append", 0);
+ }
+
+ return (0);
+}
+
+/*
+ * __debug_page_row_int --
+ * Dump an in-memory WT_PAGE_ROW_INT page.
+ */
+static int
+__debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+{
+ WT_REF *ref;
+ WT_SESSION_IMPL *session;
+ size_t len;
+ uint8_t *p;
+
+ session = ds->session;
+
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &p, &len);
+ __debug_item(ds, "K", p, len);
+ WT_RET(__debug_ref(ds, ref));
+ } WT_INTL_FOREACH_END;
+
+ if (LF_ISSET(WT_DEBUG_TREE_WALK))
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ if (ref->state == WT_REF_MEM) {
+ __dmsg(ds, "\n");
+ WT_RET(__debug_page(ds, ref->page, flags));
+ }
+ } WT_INTL_FOREACH_END;
+
+ return (0);
+}
+
+/*
+ * __debug_page_row_leaf --
+ * Dump an in-memory WT_PAGE_ROW_LEAF page.
+ */
+static int
+__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_INSERT_HEAD *insert;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+ WT_UPDATE *upd;
+ uint32_t i;
+
+ session = ds->session;
+ unpack = &_unpack;
+ WT_RET(__wt_scr_alloc(session, 256, &key));
+
+ /*
+ * Dump any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
+ __debug_row_skip(ds, insert);
+
+ /* Dump the page's K/V pairs. */
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ __debug_item(ds, "K", key->data, key->size);
+
+ if ((cell = __wt_row_leaf_value_cell(page, rip, NULL)) == NULL)
+ __dmsg(ds, "\tV {}\n");
+ else {
+ __wt_cell_unpack(cell, unpack);
+ WT_ERR(__debug_cell_data(
+ ds, page, WT_PAGE_ROW_LEAF, "V", unpack));
+ }
+
+ if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
+ __debug_update(ds, upd, 0);
+
+ if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
+ __debug_row_skip(ds, insert);
+ }
+
+err: __wt_scr_free(&key);
+ return (ret);
+}
+
+/*
+ * __debug_col_skip --
+ * Dump a column-store skiplist.
+ */
+static void
+__debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, int hexbyte)
+{
+ WT_INSERT *ins;
+
+ WT_SKIP_FOREACH(ins, head) {
+ __dmsg(ds,
+ "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins));
+ __debug_update(ds, ins->upd, hexbyte);
+ }
+}
+
+/*
+ * __debug_row_skip --
+ * Dump an insert list.
+ */
+static void
+__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
+{
+ WT_INSERT *ins;
+
+ WT_SKIP_FOREACH(ins, head) {
+ __debug_item(ds,
+ "insert", WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins));
+ __debug_update(ds, ins->upd, 0);
+ }
+}
+
+/*
+ * __debug_update --
+ * Dump an update list.
+ */
+static void
+__debug_update(WT_DBG *ds, WT_UPDATE *upd, int hexbyte)
+{
+ for (; upd != NULL; upd = upd->next)
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ __dmsg(ds, "\tvalue {deleted}\n");
+ else if (hexbyte) {
+ __dmsg(ds, "\t{");
+ __debug_hex_byte(ds,
+ ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ __dmsg(ds, "}\n");
+ } else
+ __debug_item(ds,
+ "value", WT_UPDATE_DATA(upd), upd->size);
+}
+
+/*
+ * __debug_ref --
+ * Dump a WT_REF structure.
+ */
+static int
+__debug_ref(WT_DBG *ds, WT_REF *ref)
+{
+ WT_SESSION_IMPL *session;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ session = ds->session;
+
+ __dmsg(ds, "\t");
+ switch (ref->state) {
+ case WT_REF_DISK:
+ __dmsg(ds, "disk");
+ break;
+ case WT_REF_DELETED:
+ __dmsg(ds, "deleted");
+ break;
+ case WT_REF_LOCKED:
+ __dmsg(ds, "locked %p", ref->page);
+ break;
+ case WT_REF_MEM:
+ __dmsg(ds, "memory %p", ref->page);
+ break;
+ case WT_REF_READING:
+ __dmsg(ds, "reading");
+ break;
+ case WT_REF_SPLIT:
+ __dmsg(ds, "split");
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ __dmsg(ds, " %s\n",
+ __wt_addr_string(session, addr, addr_size, ds->tmp));
+
+ return (0);
+}
+
+/*
+ * __debug_cell --
+ * Dump a single unpacked WT_CELL.
+ */
+static int
+__debug_cell(WT_DBG *ds, const WT_PAGE_HEADER *dsk, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *type;
+
+ session = ds->session;
+
+ __dmsg(ds, "\t%s: len %" PRIu32,
+ __wt_cell_type_string(unpack->raw), unpack->size);
+
+ /* Dump cell's per-disk page type information. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ switch (unpack->type) {
+ case WT_CELL_VALUE:
+ __dmsg(ds, ", recno: %" PRIu64, unpack->v);
+ break;
+ }
+ break;
+ case WT_PAGE_COL_VAR:
+ switch (unpack->type) {
+ case WT_CELL_DEL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ __dmsg(ds, ", rle: %" PRIu64, __wt_cell_rle(unpack));
+ break;
+ }
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ __dmsg(ds, ", pfx: %" PRIu8, unpack->prefix);
+ break;
+ }
+ break;
+ }
+
+ /* Dump addresses. */
+ switch (unpack->raw) {
+ case WT_CELL_ADDR_DEL:
+ type = "addr/del";
+ goto addr;
+ case WT_CELL_ADDR_INT:
+ type = "addr/int";
+ goto addr;
+ case WT_CELL_ADDR_LEAF:
+ type = "addr/leaf";
+ goto addr;
+ case WT_CELL_ADDR_LEAF_NO:
+ type = "addr/leaf-no";
+ goto addr;
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ type = "ovfl";
+addr: WT_RET(__wt_scr_alloc(session, 128, &buf));
+ __dmsg(ds, ", %s %s", type,
+ __wt_addr_string(session, unpack->data, unpack->size, buf));
+ __wt_scr_free(&buf);
+ WT_RET(ret);
+ break;
+ }
+ __dmsg(ds, "\n");
+
+ return (__debug_cell_data(ds, NULL, dsk->type, NULL, unpack));
+}
+
+/*
+ * __debug_cell_data --
+ * Dump a single cell's data in debugging mode.
+ */
+static int
+__debug_cell_data(WT_DBG *ds,
+ WT_PAGE *page, int page_type, const char *tag, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *p;
+
+ session = ds->session;
+
+ /*
+ * Column-store references to deleted cells return a NULL cell
+ * reference.
+ */
+ if (unpack == NULL) {
+ __debug_item(ds, tag, "deleted", strlen("deleted"));
+ return (0);
+ }
+
+ switch (unpack->raw) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL_RM:
+ p = __wt_cell_type_string(unpack->raw);
+ __debug_item(ds, tag, p, strlen(p));
+ break;
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_KEY_SHORT_PFX:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_COPY:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ WT_RET(__wt_scr_alloc(session, 256, &buf));
+ ret = page == NULL ?
+ __wt_dsk_cell_data_ref(session, page_type, unpack, buf) :
+ __wt_page_cell_data_ref(session, page, unpack, buf);
+ if (ret == 0)
+ __debug_item(ds, tag, buf->data, buf->size);
+ __wt_scr_free(&buf);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (ret);
+}
+
+/*
+ * __debug_item --
+ * Dump a single data/size pair, with an optional tag.
+ */
+static void
+__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size)
+{
+ size_t i;
+ int ch;
+ const uint8_t *data;
+
+ __dmsg(ds, "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " ");
+ for (data = data_arg, i = 0; i < size; ++i, ++data) {
+ ch = data[0];
+ if (isprint(ch))
+ __dmsg(ds, "%c", ch);
+ else
+ __debug_hex_byte(ds, data[0]);
+ }
+ __dmsg(ds, "}\n");
+}
+#endif
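/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * __debug_item above prints each key/value byte as-is when printable and as
 * hex otherwise. The same pattern, stand-alone:
 */
#include <ctype.h>
#include <stddef.h>
#include <stdio.h>

static void
item_dump(const char *tag, const void *data_arg, size_t size)
{
	const unsigned char *data = data_arg;

	printf("\t%s {", tag);
	for (size_t i = 0; i < size; ++i)
		if (isprint(data[i]))
			printf("%c", data[i]);
		else
			printf("%#x", data[i]);	/* hex for unprintable bytes */
	printf("}\n");
}

int
main(void)
{
	item_dump("K", "key\x01", 4);	/* prints:	K {key0x1} */
	return (0);
}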
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
new file mode 100644
index 00000000000..2fc1b0d5460
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -0,0 +1,339 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Fast-delete support.
+ *
+ * This file contains most of the code that allows WiredTiger to delete pages
+ * of data without reading them into the cache. (This feature is currently
+ * only available for row-store objects.)
+ *
+ * Cursor truncate of a row-store object works by explicitly reading the
+ * first and last pages of the truncate range, then walking the tree with a
+ * flag so the cursor walk code marks as deleted any page within the range
+ * that hasn't yet been read and that has no overflow items, by changing the
+ * WT_REF state to WT_REF_DELETED. Pages already in the cache, or pages with
+ * overflow items, have their rows updated/deleted individually. The
+ * transaction for the delete operation is stored in memory referenced by
+ * the WT_REF.page_del field.
+ *
+ * Future cursor walks of the tree will skip the deleted page based on the
+ * transaction stored for the delete, but it gets more complicated if a read is
+ * done using a random key, or a cursor walk is done with a transaction where
+ * the delete is not visible. In those cases, we read the original contents of
+ * the page. The page-read code notices a deleted page is being read, and as
+ * part of the read instantiates the contents of the page, creating a WT_UPDATE
+ * with a deleted operation, in the same transaction as deleted the page. In
+ * other words, the read process makes it appear as if the page was read and
+ * each individual row deleted, exactly as would have happened if the page had
+ * been in the cache all along.
+ *
+ * There's an additional complication to support rollback of the page delete.
+ * When the page was marked deleted, a pointer to the WT_REF was saved in the
+ * deleting session's transaction list and the delete is unrolled by resetting
+ * the WT_REF_DELETED state back to WT_REF_DISK. However, if the page has been
+ * instantiated by some reading thread, that's not enough: each individual row
+ * on the page must have the delete operation reset. If the page split, the
+ * WT_UPDATE lists might have been saved/restored during reconciliation and
+ * appear on multiple pages, and the WT_REF stored in the deleting session's
+ * transaction list is no longer useful. For this reason, when the page is
+ * instantiated by a read, a list of the WT_UPDATE structures on the page is
+ * stored in the WT_REF.page_del field, along with the transaction ID, so the
+ * session unrolling the delete can find all of the WT_UPDATE structures that
+ * must be aborted.
+ *
+ * One final note: pages can also be marked deleted if emptied and evicted. In
+ * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
+ * be any associated WT_REF.page_del field. These pages are always skipped
+ * during cursor traversal (the page could not have been evicted if there were
+ * updates that weren't globally visible), and if a read is forced to
+ * instantiate such a page, it simply creates an empty page from scratch.
+ */
+
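/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The fast-delete path below is a small state machine on WT_REF.state, driven
 * by compare-and-swap so only one thread can move a page from "on disk" to
 * "locked" before publishing the "deleted" state. Modeled here with C11
 * atomics; the enum and function names are invented for illustration:
 */
#include <stdatomic.h>
#include <stdio.h>

enum ref_state { REF_DISK, REF_LOCKED, REF_DELETED };

static _Atomic enum ref_state state = REF_DISK;

static int
fast_delete(void)
{
	enum ref_state expected = REF_DISK;

	/* Lock the reference; fail if another thread got there first. */
	if (!atomic_compare_exchange_strong(&state, &expected, REF_LOCKED))
		return (0);			/* no fast delete */

	/* ... checks that can fail would restore REF_DISK here ... */

	atomic_store(&state, REF_DELETED);	/* publish the delete */
	return (1);
}

int
main(void)
{
	printf("fast delete %s\n", fast_delete() ? "succeeded" : "skipped");
	return (0);
}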
+/*
+ * __wt_delete_page --
+ * If deleting a range, try to delete the page without instantiating it.
+ */
+int
+__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+
+ *skipp = 0;
+
+ /*
+ * Atomically switch the page's state to lock it. If the page is not
+ * on-disk, other threads may be using it, no fast delete.
+ *
+ * Possible optimization: if the page is already deleted and the delete
+ * is visible to us (the delete has been committed), we could skip the
+ * page instead of instantiating it and figuring out there are no rows
+ * in the page. While that's a huge amount of work to no purpose, it's
+ * unclear optimizing for overlapping range deletes is worth the effort.
+ */
+ if (ref->state != WT_REF_DISK ||
+ !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED))
+ return (0);
+
+ /*
+ * We cannot fast-delete pages that have overflow key/value items as
+ * the overflow blocks have to be discarded. The way we figure that
+ * out is to check the on-page cell type for the page, cells for leaf
+ * pages that have no overflow items are special.
+ *
+ * In some cases, the reference address may not reference an on-page
+ * cell (for example, some combination of page splits), in which case
+ * we can't check the original cell value and we fail.
+ *
+ * To look at an on-page cell, we need to look at the parent page, and
+ * that's dangerous: our parent page could change without warning if
+ * it were to split, deepening the tree. It's still safe, because the
+ * page's reference will always point to some valid page, and if we find
+ * any problems we simply fail the fast-delete optimization.
+ *
+ * !!!
+ * I doubt it's worth the effort, but we could copy the cell's type into
+ * the reference structure, and then we wouldn't need an on-page cell.
+ */
+ parent = ref->home;
+ if (__wt_off_page(parent, ref->addr) ||
+ __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
+ goto err;
+
+ /*
+ * This action dirties the parent page: mark it dirty now, there's no
+ * future reconciliation of the child leaf page that will dirty it as
+ * we write the tree.
+ */
+ WT_ERR(__wt_page_parent_modify_set(session, ref, 0));
+
+ /*
+ * Record the change in the transaction structure and set the change's
+ * transaction ID.
+ */
+ WT_ERR(__wt_calloc_def(session, 1, &ref->page_del));
+ ref->page_del->txnid = session->txn.id;
+
+ WT_ERR(__wt_txn_modify_ref(session, ref));
+
+ *skipp = 1;
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (0);
+
+err: __wt_free(session, ref->page_del);
+
+ /*
+ * Restore the page to on-disk status, we'll have to instantiate it.
+ */
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+ return (ret);
+}
+
+/*
+ * __wt_delete_page_rollback --
+ * Abort pages that were deleted without being instantiated.
+ */
+void
+__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_UPDATE **upd;
+
+ /*
+ * If the page is still "deleted", it's as we left it, reset the state
+ * to on-disk and we're done. Otherwise, we expect the page is either
+ * instantiated or being instantiated. Loop because it's possible for
+ * the page to return to the deleted state if instantiation fails.
+ */
+ for (;; __wt_yield())
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_READING:
+ WT_ASSERT(session, 0); /* Impossible, assert */
+ break;
+ case WT_REF_DELETED:
+ /*
+ * If the page is still "deleted", it's as we left it,
+ * reset the state.
+ */
+ if (WT_ATOMIC_CAS4(
+ ref->state, WT_REF_DELETED, WT_REF_DISK))
+ return;
+ break;
+ case WT_REF_LOCKED:
+ /*
+ * A possible state, the page is being instantiated.
+ */
+ break;
+ case WT_REF_MEM:
+ case WT_REF_SPLIT:
+ /*
+ * We can't use the normal read path to get a copy of
+ * the page because the session may have closed the
+ * cursor, we no longer have the reference to the tree
+ * required for a hazard pointer. We're safe because
+ * with unresolved transactions, the page isn't going
+ * anywhere.
+ *
+ * The page is in an in-memory state, walk the list of
+ * update structures and abort them.
+ */
+ for (upd =
+ ref->page_del->update_list; *upd != NULL; ++upd)
+ (*upd)->txnid = WT_TXN_ABORTED;
+
+ /*
+ * Discard the memory, the transaction can't abort
+ * twice.
+ */
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ return;
+ }
+}
+
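/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Once a fast-deleted page has been instantiated, rollback walks the
 * NULL-terminated update array saved in page_del and marks every update
 * aborted. A stand-alone model; the types and the WT_TXN_ABORTED-style
 * sentinel below are invented for illustration:
 */
#include <stdint.h>
#include <stdlib.h>

#define TXN_ABORTED UINT64_MAX		/* stand-in for WT_TXN_ABORTED */

struct update { uint64_t txnid; };

static void
rollback_updates(struct update **update_list)
{
	/* The list is NULL-terminated, like page_del->update_list. */
	for (struct update **upd = update_list; *upd != NULL; ++upd)
		(*upd)->txnid = TXN_ABORTED;

	free(update_list);		/* a transaction can't abort twice */
}

int
main(void)
{
	struct update u = { 42 };
	struct update **list = calloc(2, sizeof(struct update *));

	if (list == NULL)
		return (1);
	list[0] = &u;			/* list[1] stays NULL */
	rollback_updates(list);
	return (u.txnid == TXN_ABORTED ? 0 : 1);
}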
+/*
+ * __wt_delete_page_skip --
+ * If iterating a cursor, skip deleted pages that are visible to us.
+ */
+int
+__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ int skip;
+
+ /*
+ * A deleted page comes from one of two sources: either it was deleted by
+ * a fast-delete as described above, or it was emptied by other operations
+ * and eviction deleted it.
+ *
+ * In both cases, the WT_REF state will be WT_REF_DELETED. In the case
+ * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
+ * the transaction ID of the transaction that deleted the page, and the
+ * page is visible if that transaction ID is visible. In the case of an
+ * empty page, there will be no WT_PAGE_DELETED structure and the delete
+ * is by definition visible, eviction could not have deleted the page if
+ * there were changes on it that were not globally visible.
+ *
+ * We're here because we found a WT_REF state set to WT_REF_DELETED. It
+ * is possible the page is being read into memory right now, though, and
+ * the page could switch to an in-memory state at any time. Lock down
+ * the structure, just to be safe.
+ */
+ if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ return (0);
+
+ skip = ref->page_del == NULL ||
+ __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;
+
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ return (skip);
+}
+
+/*
+ * __wt_delete_page_instantiate --
+ * Instantiate an entirely deleted row-store leaf page.
+ */
+int
+__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_DELETED *page_del;
+ WT_UPDATE **upd_array, *upd;
+ uint32_t i;
+
+ btree = S2BT(session);
+ page = ref->page;
+ page_del = ref->page_del;
+
+ /*
+ * Give the page a modify structure.
+ *
+ * If the tree is already dirty and so will be written, mark the page
+ * dirty. (We'd like to free the deleted pages, but if the handle is
+ * read-only or if the application never modifies the tree, we're not
+ * able to do so.)
+ */
+ if (btree->modified) {
+ WT_RET(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+ }
+
+ /*
+ * An operation is accessing a "deleted" page, and we're building an
+ * in-memory version of the page (making it look like all entries in
+ * the page were individually updated by a remove operation). There
+ * are two cases where we end up here:
+ *
+ * First, a running transaction used a truncate call to delete the page
+ * without reading it, in which case the page reference includes a
+ * structure with a transaction ID; the page we're building might split
+ * in the future, so we update that structure to include references to
+ * all of the update structures we create, so the transaction can abort.
+ *
+ * Second, a truncate call deleted a page and the truncate committed,
+ * but an older transaction in the system forced us to keep the old
+ * version of the page around, then we crashed and recovered, and now
+ * we're being forced to read that page.
+ *
+ * In the first case, we have a page reference structure; in the second
+ * case, we don't.
+ *
+ * Allocate the per-reference update array; in the case of instantiating
+ * a page, deleted by a running transaction that might eventually abort,
+ * we need a list of the update structures so we can do that abort. The
+ * hard case is if a page splits: the update structures might be moved
+ * to different pages, and we still have to find them all for an abort.
+ */
+
+ if (page_del != NULL)
+ WT_RET(__wt_calloc_def(
+ session, page->pg_row_entries + 1, &page_del->update_list));
+
+ /* Allocate the per-page update array. */
+ WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array));
+ page->pg_row_upd = upd_array;
+
+ /*
+ * Fill in the per-reference update array with references to update
+ * structures, and fill in the per-page update array with references to
+ * deleted items.
+ */
+ for (i = 0; i < page->pg_row_entries; ++i) {
+ WT_ERR(__wt_calloc_def(session, 1, &upd));
+ WT_UPDATE_DELETED_SET(upd);
+
+ if (page_del == NULL)
+ upd->txnid = WT_TXN_NONE; /* Globally visible */
+ else {
+ upd->txnid = page_del->txnid;
+ page_del->update_list[i] = upd;
+ }
+
+ upd->next = upd_array[i];
+ upd_array[i] = upd;
+ }
+
+ __wt_cache_page_inmem_incr(session, page,
+ page->pg_row_entries * (sizeof(WT_UPDATE *) + sizeof(WT_UPDATE)));
+
+ return (0);
+
+err: /*
+ * There's no need to free the page update structures on error, our
+ * caller will discard the page and do that work for us. We could
+ * similarly leave the per-reference update array alone because it
+ * won't ever be used by any page that's not in-memory, but cleaning
+ * it up makes sense, especially if we come back in to this function
+ * attempting to instantiate this page again.
+ */
+ if (page_del != NULL)
+ __wt_free(session, page_del->update_list);
+ return (ret);
+}
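/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Instantiation above makes a truncated page look as if each row had been
 * removed individually, allocating one "deleted" update per row and, when the
 * truncate is still unresolved, recording each update in the reference's
 * NULL-terminated list so a later abort can find them all. A stand-alone
 * model; all types and names are invented (error paths leak, it's a sketch):
 */
#include <stdint.h>
#include <stdlib.h>

struct update { uint64_t txnid; int deleted; };

static int
instantiate_deleted(uint32_t entries, uint64_t txnid,
    struct update ***upd_arrayp, struct update ***update_listp)
{
	/* Per-page update array, one slot per row. */
	struct update **upd_array = calloc(entries, sizeof(*upd_array));
	/* Per-reference list, NULL-terminated, for a possible abort. */
	struct update **update_list =
	    calloc(entries + 1, sizeof(*update_list));

	if (upd_array == NULL || update_list == NULL)
		return (-1);

	for (uint32_t i = 0; i < entries; ++i) {
		struct update *upd = calloc(1, sizeof(*upd));
		if (upd == NULL)
			return (-1);
		upd->deleted = 1;	/* every row looks removed */
		upd->txnid = txnid;
		update_list[i] = upd;	/* findable by a future abort */
		upd_array[i] = upd;
	}

	*upd_arrayp = upd_array;
	*update_listp = update_list;
	return (0);
}

int
main(void)
{
	struct update **a, **l;

	return (instantiate_deleted(3, 7, &a, &l));
}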
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
new file mode 100644
index 00000000000..a162e2dc841
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -0,0 +1,422 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __free_page_modify(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_col_var(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_int(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
+static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
+static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
+static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);
+
+/*
+ * __wt_ref_out --
+ * Discard an in-memory page, freeing all memory associated with it.
+ */
+void
+__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ /*
+ * A version of the page-out function that allows us to make additional
+ * diagnostic checks.
+ */
+ WT_ASSERT(session, S2BT(session)->evict_ref != ref);
+
+ __wt_page_out(session, &ref->page);
+}
+
+/*
+ * __wt_page_out --
+ * Discard an in-memory page, freeing all memory associated with it.
+ */
+void
+__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+{
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ WT_PAGE_MODIFY *mod;
+
+ /*
+ * Kill our caller's reference, do our best to catch races.
+ */
+ page = *pagep;
+ *pagep = NULL;
+
+ /*
+ * We should never discard a dirty page, the file's current eviction
+ * point or a page queued for LRU eviction.
+ */
+ WT_ASSERT(session, !__wt_page_is_modified(page));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLITTING));
+
+#ifdef HAVE_DIAGNOSTIC
+ {
+ WT_HAZARD *hp;
+ int i;
+ /*
+ * Make sure no other thread has a hazard pointer on the page we are
+ * about to discard. This is complicated by the fact that readers
+ * publish their hazard pointer before re-checking the page state, so
+ * our check can race with readers without indicating a real problem.
+ * Wait for up to a second for hazard pointers to be cleared.
+ */
+ for (hp = NULL, i = 0; i < 100; i++) {
+ if ((hp = __wt_page_hazard_check(session, page)) == NULL)
+ break;
+ __wt_sleep(0, 10000);
+ }
+ if (hp != NULL)
+ __wt_errx(session,
+ "discarded page has hazard pointer: (%p: %s, line %d)",
+ hp->page, hp->file, hp->line);
+ WT_ASSERT(session, hp == NULL);
+ }
+#endif
+
+ /*
+ * If a root page split, there may be one or more pages linked from the
+ * page; walk the list, discarding pages.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ mod = page->modify;
+ if (mod != NULL && mod->mod_root_split != NULL)
+ __wt_page_out(session, &mod->mod_root_split);
+ break;
+ }
+
+ /* Update the cache's information. */
+ __wt_cache_page_evict(session, page);
+
+ /*
+ * If discarding the page as part of process exit, the application may
+ * configure to leak the memory rather than do the work.
+ */
+ if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
+ return;
+
+ /* Free the page modification information. */
+ if (page->modify != NULL)
+ __free_page_modify(session, page);
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ __free_page_int(session, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ __free_page_col_var(session, page);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ __free_page_row_leaf(session, page);
+ break;
+ }
+
+ /* Discard any disk image. */
+ dsk = (WT_PAGE_HEADER *)page->dsk;
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
+ __wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
+ (void)__wt_mmap_discard(session, dsk, dsk->mem_size);
+
+ __wt_overwrite_and_free(session, page);
+}
+
+/*
+ * __free_page_modify --
+ * Discard the page's associated modification structures.
+ */
+static void
+__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_INSERT_HEAD *append;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_MULTIBLOCK:
+ /* Free list of replacement blocks. */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __wt_free(session, multi->key.ikey);
+ break;
+ }
+ __wt_free(session, multi->skip);
+ __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->addr.addr);
+ }
+ __wt_free(session, mod->mod_multi);
+ break;
+ case WT_PM_REC_REPLACE:
+ /*
+ * Discard any replacement address: this memory is usually moved
+ * into the parent's WT_REF, but at the root that can't happen.
+ */
+ __wt_free(session, mod->mod_replace.addr);
+ break;
+ }
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ /* Free the append array. */
+ if ((append = WT_COL_APPEND(page)) != NULL) {
+ __free_skip_list(session, WT_SKIP_FIRST(append));
+ __wt_free(session, append);
+ __wt_free(session, mod->mod_append);
+ }
+
+ /* Free the insert/update array. */
+ if (mod->mod_update != NULL)
+ __free_skip_array(session, mod->mod_update,
+ page->type ==
+ WT_PAGE_COL_FIX ? 1 : page->pg_var_entries);
+ break;
+ }
+
+ /* Free the overflow on-page, reuse and transaction-cache skiplists. */
+ __wt_ovfl_reuse_free(session, page);
+ __wt_ovfl_txnc_free(session, page);
+ __wt_ovfl_discard_free(session, page);
+
+ __wt_free(session, page->modify->ovfl_track);
+
+ __wt_free(session, page->modify);
+}
+
+/*
+ * __free_page_int --
+ * Discard a WT_PAGE_COL_INT or WT_PAGE_ROW_INT page.
+ */
+static void
+__free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ __wt_free_ref_index(session, page, WT_INTL_INDEX_COPY(page), 0);
+}
+
+/*
+ * __wt_free_ref --
+ * Discard the contents of a WT_REF structure (optionally including the
+ * pages it references).
+ */
+void
+__wt_free_ref(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages)
+{
+ WT_IKEY *ikey;
+
+ if (ref == NULL)
+ return;
+
+ /*
+ * Optionally free the referenced pages. (The path that frees referenced
+ * pages is used only for error cleanup; no instantiated and then discarded
+ * page should have WT_REF entries with real pages. The page may have been
+ * marked dirty as well; page discard checks for that, so we mark it clean
+ * explicitly.)
+ */
+ if (free_pages && ref->page != NULL) {
+ if (ref->page->modify != NULL) {
+ ref->page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, ref->page);
+ }
+ __wt_page_out(session, &ref->page);
+ }
+
+ /* Free any key allocation. */
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
+ __wt_free(session, ikey);
+ break;
+ }
+
+ /* Free any address allocation. */
+ if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /* Free any page-deleted information. */
+ if (ref->page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ __wt_overwrite_and_free(session, ref);
+}
+
+/*
+ * __wt_free_ref_index --
+ * Discard a page index and its references.
+ */
+void
+__wt_free_ref_index(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages)
+{
+ uint32_t i;
+
+ if (pindex == NULL)
+ return;
+
+ for (i = 0; i < pindex->entries; ++i)
+ __wt_free_ref(session, page, pindex->index[i], free_pages);
+ __wt_free(session, pindex);
+}
+
+/*
+ * __free_page_col_var --
+ * Discard a WT_PAGE_COL_VAR page.
+ */
+static void
+__free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /* Free the RLE lookup array. */
+ __wt_free(session, page->pg_var_repeats);
+}
+
+/*
+ * __free_page_row_leaf --
+ * Discard a WT_PAGE_ROW_LEAF page.
+ */
+static void
+__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_IKEY *ikey;
+ WT_ROW *rip;
+ uint32_t i;
+ void *copy;
+
+ /*
+ * Free the in-memory index array.
+ *
+ * For each entry, see if the key was an allocation (that is, if it
+ * points somewhere other than the original page), and if so, free
+ * the memory.
+ */
+ WT_ROW_FOREACH(page, rip, i) {
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, NULL, NULL, NULL);
+ if (ikey != NULL)
+ __wt_free(session, ikey);
+ }
+
+ /*
+ * Free the insert array.
+ *
+ * Row-store tables have one additional slot in the insert array: it
+ * holds keys that sort before any key found on the original page.
+ */
+ if (page->pg_row_ins != NULL)
+ __free_skip_array(
+ session, page->pg_row_ins, page->pg_row_entries + 1);
+
+ /* Free the update array. */
+ if (page->pg_row_upd != NULL)
+ __free_update(session, page->pg_row_upd, page->pg_row_entries);
+}
+
+/*
+ * __free_skip_array --
+ * Discard an array of skip list headers.
+ */
+static void
+__free_skip_array(
+ WT_SESSION_IMPL *session, WT_INSERT_HEAD **head_arg, uint32_t entries)
+{
+ WT_INSERT_HEAD **head;
+
+ /*
+ * For each non-NULL slot in the page's array of inserts, free the
+ * linked list anchored in that slot.
+ */
+ for (head = head_arg; entries > 0; --entries, ++head)
+ if (*head != NULL) {
+ __free_skip_list(session, WT_SKIP_FIRST(*head));
+ __wt_free(session, *head);
+ }
+
+ /* Free the header array. */
+ __wt_free(session, head_arg);
+}
+
+/*
+ * __free_skip_list --
+ * Walk a WT_INSERT forward-linked list and free the per-thread combination
+ * of a WT_INSERT structure and its associated chain of WT_UPDATE structures.
+ */
+static void
+__free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
+{
+ WT_INSERT *next;
+
+ for (; ins != NULL; ins = next) {
+ __free_update_list(session, ins->upd);
+ next = WT_SKIP_NEXT(ins);
+ __wt_free(session, ins);
+ }
+}
+
+/*
+ * __free_update --
+ * Discard the update array.
+ */
+static void
+__free_update(
+ WT_SESSION_IMPL *session, WT_UPDATE **update_head, uint32_t entries)
+{
+ WT_UPDATE **updp;
+
+ /*
+ * For each non-NULL slot in the page's array of updates, free the
+ * linked list anchored in that slot.
+ */
+ for (updp = update_head; entries > 0; --entries, ++updp)
+ if (*updp != NULL)
+ __free_update_list(session, *updp);
+
+ /* Free the update array. */
+ __wt_free(session, update_head);
+}
+
+/*
+ * __free_update_list --
+ * Walk a WT_UPDATE forward-linked list and free the per-thread combination
+ * of a WT_UPDATE structure and its associated data.
+ */
+static void
+__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_UPDATE *next;
+
+ for (; upd != NULL; upd = next) {
+ /* Everything we free should be visible to everyone. */
+ WT_ASSERT(session,
+ F_ISSET(session, WT_SESSION_DISCARD_FORCE) ||
+ upd->txnid == WT_TXN_ABORTED ||
+ __wt_txn_visible_all(session, upd->txnid));
+
+ next = upd->next;
+ __wt_free(session, upd);
+ }
+}
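/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Every list teardown in this file follows the same pattern: read the "next"
 * pointer before freeing the current node, since the node can't be touched
 * once freed. Stand-alone, with an invented node type:
 */
#include <stdlib.h>

struct upd { struct upd *next; };

static void
free_list(struct upd *upd)
{
	struct upd *next;

	for (; upd != NULL; upd = next) {
		next = upd->next;	/* save before the memory goes away */
		free(upd);
	}
}

int
main(void)
{
	struct upd *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	if (a == NULL || b == NULL)
		return (1);
	a->next = b;
	free_list(a);
	return (0);
}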
diff --git a/src/third_party/wiredtiger/src/btree/bt_evict.c b/src/third_party/wiredtiger/src/btree/bt_evict.c
new file mode 100644
index 00000000000..ff049553c7f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_evict.c
@@ -0,0 +1,1297 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __evict_clear_walks(WT_SESSION_IMPL *);
+static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *);
+static int __evict_lru(WT_SESSION_IMPL *, uint32_t);
+static int __evict_lru_cmp(const void *, const void *);
+static int __evict_lru_pages(WT_SESSION_IMPL *, int);
+static int __evict_pass(WT_SESSION_IMPL *);
+static int __evict_walk(WT_SESSION_IMPL *, uint32_t *, uint32_t);
+static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t);
+static void *__evict_worker(void *);
+
+/*
+ * __evict_read_gen --
+ * Get the adjusted read generation for an eviction entry.
+ */
+static inline uint64_t
+__evict_read_gen(const WT_EVICT_ENTRY *entry)
+{
+ WT_PAGE *page;
+ uint64_t read_gen;
+
+ /* Never prioritize empty slots. */
+ if (entry->ref == NULL)
+ return (UINT64_MAX);
+
+ page = entry->ref->page;
+ read_gen = page->read_gen + entry->btree->evict_priority;
+
+ /*
+ * Skew the read generation for internal pages, we prefer to evict leaf
+ * pages.
+ */
+ if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT)
+ read_gen += WT_EVICT_INT_SKEW;
+
+ return (read_gen);
+}
+
+/*
+ * __evict_lru_cmp --
+ * Qsort function: sort the eviction array.
+ */
+static int
+__evict_lru_cmp(const void *a, const void *b)
+{
+ uint64_t a_lru, b_lru;
+
+ a_lru = __evict_read_gen(a);
+ b_lru = __evict_read_gen(b);
+
+ return ((a_lru < b_lru) ? -1 : (a_lru == b_lru) ? 0 : 1);
+}
+
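/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The comparator above sorts the eviction array by adjusted read generation,
 * with empty slots pushed to the end by reporting UINT64_MAX, and it compares
 * rather than subtracts because a uint64_t difference overflows an int.
 * Stand-alone, with an invented entry type:
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct entry { int used; uint64_t read_gen; };

static uint64_t
entry_gen(const struct entry *e)
{
	return (e->used ? e->read_gen : UINT64_MAX);	/* empty slots last */
}

static int
gen_cmp(const void *a, const void *b)
{
	uint64_t ga = entry_gen(a), gb = entry_gen(b);

	return (ga < gb ? -1 : ga == gb ? 0 : 1);
}

int
main(void)
{
	struct entry list[] = { {1, 9}, {0, 0}, {1, 3} };

	qsort(list, 3, sizeof(list[0]), gen_cmp);
	for (int i = 0; i < 3; ++i)	/* prints gen 3, 9, then the empty slot */
		printf("%d: used=%d gen=%llu\n", i, list[i].used,
		    (unsigned long long)list[i].read_gen);
	return (0);
}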
+/*
+ * __evict_list_clear --
+ * Clear an entry in the LRU eviction list.
+ */
+static inline void
+__evict_list_clear(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *e)
+{
+ if (e->ref != NULL) {
+ WT_ASSERT(session,
+ F_ISSET_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU));
+ F_CLR_ATOMIC(e->ref->page, WT_PAGE_EVICT_LRU);
+ }
+ e->ref = NULL;
+ e->btree = WT_DEBUG_POINT;
+}
+
+/*
+ * __wt_evict_list_clear_page --
+ * Make sure a page is not in the LRU eviction list. This is called from
+ * the page eviction code to make sure there is no attempt to evict a child
+ * page multiple times.
+ */
+void
+__wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint32_t i, elem;
+
+ WT_ASSERT(session,
+ __wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);
+
+ /* Fast path: if the page isn't on the queue, don't bother searching. */
+ if (!F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU))
+ return;
+
+ cache = S2C(session)->cache;
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ elem = cache->evict_max;
+ for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ if (evict->ref == ref) {
+ __evict_list_clear(session, evict);
+ break;
+ }
+
+ WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
+
+ __wt_spin_unlock(session, &cache->evict_lock);
+}
+
+/*
+ * __wt_evict_server_wake --
+ * Wake the eviction server thread.
+ */
+int
+__wt_evict_server_wake(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_EVICTSERVER)) {
+ uint64_t bytes_inuse, bytes_max;
+
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ bytes_max = conn->cache_size;
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "waking, bytes inuse %s max (%" PRIu64
+ "MB %s %" PRIu64 "MB)",
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ bytes_inuse / WT_MEGABYTE,
+ bytes_inuse <= bytes_max ? "<=" : ">",
+ bytes_max / WT_MEGABYTE));
+ }
+
+ return (__wt_cond_signal(session, cache->evict_cond));
+}
+
+/*
+ * __evict_server --
+ * Thread to evict pages from the cache.
+ */
+static void *
+__evict_server(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *worker;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+ cache = conn->cache;
+
+ while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
+ /* Evict pages from the cache as needed. */
+ WT_ERR(__evict_pass(session));
+
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ break;
+
+ /*
+ * If we have caught up and there are more than the minimum
+ * number of eviction workers running, shut one down.
+ */
+ if (conn->evict_workers > conn->evict_workers_min) {
+ WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Stopping evict worker: %"PRIu32"\n",
+ conn->evict_workers));
+ worker = &conn->evict_workctx[--conn->evict_workers];
+ F_CLR(worker, WT_EVICT_WORKER_RUN);
+ WT_TRET(__wt_cond_signal(
+ session, cache->evict_waiter_cond));
+ WT_TRET(__wt_thread_join(session, worker->tid));
+ /*
+ * Flag errors here with a message, but don't shut down
+ * the eviction server; that would be fatal.
+ */
+ WT_ASSERT(session, ret == 0);
+ if (ret != 0) {
+ (void)__wt_msg(session,
+ "Error stopping eviction worker: %d", ret);
+ ret = 0;
+ }
+ }
+ F_CLR(cache, WT_EVICT_ACTIVE);
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"));
+ /* Don't rely on signals: check periodically. */
+ WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000));
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking"));
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "exiting"));
+
+err:
+ if (ret != 0) {
+ WT_PANIC_MSG(session, ret, "eviction server error");
+ return (NULL);
+ }
+
+ if (cache->pages_inmem != cache->pages_evict)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " pages in "
+ "memory and %" PRIu64 " pages evicted",
+ cache->pages_inmem, cache->pages_evict);
+ if (cache->bytes_inmem != cache->bytes_evict)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64 " bytes in "
+ "memory and %" PRIu64 " bytes evicted",
+ cache->bytes_inmem, cache->bytes_evict);
+ if (cache->bytes_dirty != 0 || cache->pages_dirty != 0)
+ __wt_errx(session,
+ "cache server: exiting with %" PRIu64
+ " bytes dirty and %" PRIu64 " pages dirty",
+ cache->bytes_dirty, cache->pages_dirty);
+
+ return (NULL);
+}
+
+/*
+ * __wt_evict_create --
+ * Start the eviction server thread.
+ */
+int
+__wt_evict_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_EVICT_WORKER *workers;
+ u_int i;
+
+ conn = S2C(session);
+
+ /* Set first, the thread might run before we finish up. */
+ F_SET(conn, WT_CONN_EVICTION_RUN);
+
+ /* We need a session handle because we're reading/writing pages. */
+ WT_RET(__wt_open_internal_session(
+ conn, "eviction-server", 0, 0, &conn->evict_session));
+ session = conn->evict_session;
+
+ /*
+ * If there's only a single eviction thread, it may be called upon to
+ * perform slow operations for the block manager. (The flag is not
+ * reset if reconfigured later, but I doubt that's a problem.)
+ */
+ if (conn->evict_workers_max == 0)
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ if (conn->evict_workers_max > 0) {
+ WT_RET(__wt_calloc_def(
+ session, conn->evict_workers_max, &workers));
+ conn->evict_workctx = workers;
+
+ for (i = 0; i < conn->evict_workers_max; i++) {
+ WT_RET(__wt_open_internal_session(conn,
+ "eviction-worker", 0, 0, &workers[i].session));
+ workers[i].id = i;
+ F_SET(workers[i].session, WT_SESSION_CAN_WAIT);
+
+ if (i < conn->evict_workers_min) {
+ ++conn->evict_workers;
+ F_SET(&workers[i], WT_EVICT_WORKER_RUN);
+ WT_RET(__wt_thread_create(
+ workers[i].session, &workers[i].tid,
+ __evict_worker, &workers[i]));
+ }
+ }
+ }
+
+ /*
+ * Start the primary eviction server thread after the worker threads
+ * have started to avoid it starting additional worker threads before
+ * the worker's sessions are created.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->evict_tid, __evict_server, session));
+ conn->evict_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_evict_destroy --
+ * Destroy the eviction server thread.
+ */
+int
+__wt_evict_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *workers;
+ WT_SESSION *wt_session;
+ u_int i;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ workers = conn->evict_workctx;
+
+ F_CLR(conn, WT_CONN_EVICTION_RUN);
+
+ WT_TRET(__wt_verbose(
+ session, WT_VERB_EVICTSERVER, "waiting for helper threads"));
+ for (i = 0; i < conn->evict_workers; i++) {
+ WT_TRET(__wt_cond_signal(session, cache->evict_waiter_cond));
+ WT_TRET(__wt_thread_join(session, workers[i].tid));
+ }
+ /* Handle shutdown when cleaning up after a failed open */
+ if (conn->evict_workctx != NULL) {
+ for (i = 0; i < conn->evict_workers_max; i++) {
+ wt_session = &conn->evict_workctx[i].session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+ __wt_free(session, conn->evict_workctx);
+ }
+
+ if (conn->evict_tid_set) {
+ WT_TRET(__wt_evict_server_wake(session));
+ WT_TRET(__wt_thread_join(session, conn->evict_tid));
+ conn->evict_tid_set = 0;
+ }
+
+ if (conn->evict_session != NULL) {
+ wt_session = &conn->evict_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ conn->evict_session = NULL;
+ }
+
+ return (ret);
+}
+
+/*
+ * __evict_worker --
+ * Thread to help evict pages from the cache.
+ */
+static void *
+__evict_worker(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_EVICT_WORKER *worker;
+ WT_SESSION_IMPL *session;
+ uint32_t flags;
+
+ worker = arg;
+ session = worker->session;
+ conn = S2C(session);
+ cache = conn->cache;
+
+ while (F_ISSET(conn, WT_CONN_EVICTION_RUN) &&
+ F_ISSET(worker, WT_EVICT_WORKER_RUN)) {
+ /* Don't spin in a busy loop if there is no work to do */
+ WT_ERR(__evict_has_work(session, &flags));
+ if (flags == 0)
+ WT_ERR(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 10000));
+ else
+ WT_ERR(__evict_lru_pages(session, 1));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "cache eviction helper error");
+ }
+
+ WT_TRET(__wt_verbose(session, WT_VERB_EVICTSERVER, "helper exiting"));
+
+ return (NULL);
+}
+
+/*
+ * __evict_has_work --
+ * Find out if there is eviction work to be done.
+ */
+static int
+__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ uint32_t flags;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ flags = 0;
+ *flagsp = 0;
+
+ if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
+ return (0);
+
+ /*
+ * Figure out whether the cache usage exceeds either the eviction
+ * target or the dirty target.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ dirty_inuse = cache->bytes_dirty;
+ bytes_max = conn->cache_size;
+
+ /* Check to see if the eviction server should run. */
+ if (bytes_inuse > (cache->eviction_target * bytes_max) / 100)
+ LF_SET(WT_EVICT_PASS_ALL);
+ else if (dirty_inuse >
+ (cache->eviction_dirty_target * bytes_max) / 100)
+ /* Ignore clean pages unless the cache is too large */
+ LF_SET(WT_EVICT_PASS_DIRTY);
+
+ if (F_ISSET(cache, WT_EVICT_STUCK))
+ LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+
+ *flagsp = flags;
+ return (0);
+}
+
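/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The checks above are plain percentage arithmetic: eviction of all page
 * types runs when cache use crosses the eviction target, and dirty-only
 * eviction when dirty bytes cross the dirty target. The sizes below are
 * made-up inputs:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cache_size = 100u << 20;	/* 100MB cache */
	uint64_t bytes_inuse = 85u << 20;	/* 85MB in use */
	uint64_t dirty_inuse = 70u << 20;	/* 70MB dirty */
	uint32_t eviction_target = 80, dirty_target = 75;

	if (bytes_inuse > (eviction_target * cache_size) / 100)
		printf("evict all page types\n");	/* 85 > 80: this */
	else if (dirty_inuse > (dirty_target * cache_size) / 100)
		printf("evict dirty pages only\n");
	else
		printf("no eviction work\n");
	return (0);
}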
+/*
+ * __evict_pass --
+ * Evict pages from memory.
+ */
+static int
+__evict_pass(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_EVICT_WORKER *worker;
+ int loop;
+ uint32_t flags;
+ uint64_t bytes_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /* Evict pages from the cache. */
+ for (loop = 0;; loop++) {
+ /*
+ * If there is a request to clear eviction walks, do that now,
+ * before checking if the cache is full.
+ */
+ if (F_ISSET(cache, WT_EVICT_CLEAR_WALKS)) {
+ F_CLR(cache, WT_EVICT_CLEAR_WALKS);
+ WT_RET(__evict_clear_walks(session));
+ WT_RET(__wt_cond_signal(
+ session, cache->evict_waiter_cond));
+ }
+
+ WT_RET(__evict_has_work(session, &flags));
+ if (flags == 0)
+ break;
+
+ if (loop > 10)
+ LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ /*
+ * When the cache is full, track whether pages are being
+ * evicted. This will be cleared by the next thread to
+ * successfully evict a page.
+ */
+ if (bytes_inuse > conn->cache_size)
+ F_SET(cache, WT_EVICT_NO_PROGRESS);
+ else
+ F_CLR(cache, WT_EVICT_NO_PROGRESS);
+
+ /* Start a worker if we have capacity and the cache is full. */
+ if (bytes_inuse > conn->cache_size &&
+ conn->evict_workers < conn->evict_workers_max) {
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Starting evict worker: %"PRIu32"\n",
+ conn->evict_workers));
+ worker = &conn->evict_workctx[conn->evict_workers++];
+ F_SET(worker, WT_EVICT_WORKER_RUN);
+ WT_RET(__wt_thread_create(session,
+ &worker->tid, __evict_worker, worker));
+ }
+
+ F_SET(cache, WT_EVICT_ACTIVE);
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "Eviction pass with: Max: %" PRIu64
+ " In use: %" PRIu64 " Dirty: %" PRIu64,
+ conn->cache_size, bytes_inuse, cache->bytes_dirty));
+
+ WT_RET(__evict_lru(session, flags));
+
+ /*
+ * If we're making progress, keep going; if we're not making
+ * any progress at all, mark the cache "stuck" and go back to
+ * sleep, it's not something we can fix.
+ */
+ if (F_ISSET(cache, WT_EVICT_NO_PROGRESS)) {
+ if (F_ISSET(cache, WT_EVICT_STUCK))
+ break;
+ if (loop == 100) {
+ F_SET(cache, WT_EVICT_STUCK);
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_slow);
+ WT_RET(__wt_verbose(
+ session, WT_VERB_EVICTSERVER,
+ "unable to reach eviction goal"));
+ break;
+ }
+ } else
+ loop = 0;
+ }
+ return (0);
+}
+
+/*
+ * __evict_clear_walks --
+ * Clear the eviction walk points for all files.
+ */
+static int
+__evict_clear_walks(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_REF *ref;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cache->evict_file_next = NULL;
+
+ /*
+ * Lock the dhandle list so sweeping cannot change the pointers out
+ * from under us.
+ *
+ * NOTE: we don't hold the schema lock, so we have to take care
+ * that the handles we see are open and valid.
+ */
+ __wt_spin_lock(session, &conn->dhandle_lock);
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ /* Ignore non-file handles, or handles that aren't open. */
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ btree = dhandle->handle;
+ session->dhandle = dhandle;
+ if ((ref = btree->evict_ref) != NULL) {
+ /*
+ * Clear evict_ref first, in case releasing it forces
+ * eviction (we assert that we never try to evict the
+ * current eviction walk point).
+ */
+ btree->evict_ref = NULL;
+ WT_TRET(__wt_page_release(session, ref, 0));
+ }
+ session->dhandle = NULL;
+ }
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+ return (ret);
+}
+
+/*
+ * __evict_tree_walk_clear --
+ * Clear the tree's current eviction point, acquiring the eviction lock.
+ */
+static int
+__evict_tree_walk_clear(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ while (btree->evict_ref != NULL) {
+ F_SET(cache, WT_EVICT_CLEAR_WALKS);
+ WT_RET(__wt_cond_wait(
+ session, cache->evict_waiter_cond, 100000));
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_evict_page --
+ * Evict a given page.
+ */
+int
+__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_ISOLATION saved_iso;
+
+ /*
+ * We have to take care when evicting pages not to write a change that:
+ * (a) is not yet committed; or
+ * (b) is committed more recently than an in-progress checkpoint.
+ *
+ * We handle both of these cases by setting up the transaction context
+ * before evicting, using a special "eviction" isolation level, where
+ * only globally visible updates can be evicted.
+ */
+ __wt_txn_update_oldest(session);
+ txn = &session->txn;
+ saved_iso = txn->isolation;
+ txn->isolation = TXN_ISO_EVICTION;
+
+ /*
+ * Sanity check: if a transaction has updates, its updates should not
+ * be visible to eviction.
+ */
+ WT_ASSERT(session,
+ !F_ISSET(txn, TXN_HAS_ID) || !__wt_txn_visible(session, txn->id));
+
+ ret = __wt_rec_evict(session, ref, 0);
+ txn->isolation = saved_iso;
+
+ return (ret);
+}
+
+/*
+ * __wt_evict_file_exclusive_on --
+ * Get exclusive eviction access to a file and discard any of the file's
+ * blocks queued for eviction.
+ */
+int
+__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ u_int i, elem;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ /*
+ * Hold the walk lock to set the "no eviction" flag: no new pages from
+ * the file will be queued for eviction after this point.
+ */
+ __wt_spin_lock(session, &cache->evict_walk_lock);
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ __wt_spin_unlock(session, &cache->evict_walk_lock);
+
+ /* Clear any existing LRU eviction walk for the file. */
+ WT_RET(__evict_tree_walk_clear(session));
+
+ /* Hold the evict lock to remove any queued pages from this file. */
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ /*
+ * The eviction candidate list might reference pages from the file;
+ * clear it.
+ */
+ elem = cache->evict_max;
+ for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ if (evict->btree == btree)
+ __evict_list_clear(session, evict);
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ /*
+ * We have disabled further eviction: wait for concurrent LRU eviction
+ * activity to drain.
+ */
+ while (btree->evict_busy > 0)
+ __wt_yield();
+
+ return (0);
+}
+
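/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * Exclusive access above is a two-step handshake: set a "no eviction" flag so
 * no new pages from the file are queued, then wait for the in-flight ("busy")
 * eviction count to drain. Modeled with C11 atomics and POSIX sched_yield();
 * all names are invented for illustration:
 */
#include <sched.h>
#include <stdatomic.h>

static atomic_int no_eviction;		/* WT_BTREE_NO_EVICTION stand-in */
static atomic_int evict_busy;		/* concurrent evictions in flight */

static void
evict_exclusive_on(void)
{
	atomic_store(&no_eviction, 1);	/* stop new eviction of this file */

	/* Wait for eviction already under way to drain. */
	while (atomic_load(&evict_busy) > 0)
		sched_yield();
}

static void
evict_exclusive_off(void)
{
	atomic_store(&no_eviction, 0);
}

int
main(void)
{
	evict_exclusive_on();
	evict_exclusive_off();
	return (0);
}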
+/*
+ * __wt_evict_file_exclusive_off --
+ * Release exclusive eviction access to a file.
+ */
+void
+__wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ WT_ASSERT(session, btree->evict_ref == NULL);
+
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+}
+
+/*
+ * __evict_lru_pages --
+ * Get pages from the LRU queue to evict.
+ */
+static int
+__evict_lru_pages(WT_SESSION_IMPL *session, int is_app)
+{
+ WT_DECL_RET;
+
+ /*
+ * Reconcile and discard some pages: EBUSY is returned if a page fails
+ * eviction because it's unavailable, continue in that case.
+ */
+ while ((ret = __wt_evict_lru_page(session, is_app)) == 0 ||
+ ret == EBUSY)
+ ;
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
+
+/*
+ * __evict_lru --
+ * Evict pages from the cache based on their read generation.
+ */
+static int
+__evict_lru(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint64_t cutoff;
+ uint32_t candidates, entries, i;
+
+ cache = S2C(session)->cache;
+
+ /* Get some more pages to consider for eviction. */
+ WT_RET(__evict_walk(session, &entries, flags));
+
+ /* Sort the list into LRU order and restart. */
+ __wt_spin_lock(session, &cache->evict_lock);
+
+ qsort(cache->evict,
+ entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
+
+ while (entries > 0 && cache->evict[entries - 1].ref == NULL)
+ --entries;
+
+ cache->evict_entries = entries;
+
+ if (entries == 0) {
+ /*
+ * If there are no entries, there cannot be any candidates.
+ * Make sure application threads don't read past the end of the
+ * candidate list, or they may race with the next walk.
+ */
+ cache->evict_candidates = 0;
+ cache->evict_current = NULL;
+ __wt_spin_unlock(session, &cache->evict_lock);
+ return (0);
+ }
+
+ WT_ASSERT(session, cache->evict[0].ref != NULL);
+
+ /* Find the bottom 25% of read generations. */
+ cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
+ __evict_read_gen(&cache->evict[entries - 1])) / 4;
+
+ /*
+ * Don't take less than 10% or more than 50% of entries, regardless.
+ * That said, if there is only one entry, which is normal when
+ * populating an empty file, don't exclude it.
+ */
+ for (candidates = 1 + entries / 10;
+ candidates < entries / 2;
+ candidates++)
+ if (__evict_read_gen(&cache->evict[candidates]) > cutoff)
+ break;
+ cache->evict_candidates = candidates;
+
+ /* If we have more than the minimum number of entries, clear them. */
+ if (cache->evict_entries > WT_EVICT_WALK_BASE) {
+ for (i = WT_EVICT_WALK_BASE, evict = cache->evict + i;
+ i < cache->evict_entries;
+ i++, evict++)
+ __evict_list_clear(session, evict);
+ cache->evict_entries = WT_EVICT_WALK_BASE;
+ }
+
+ cache->evict_current = cache->evict;
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ /*
+ * The eviction server thread doesn't do any actual eviction if there
+ * are multiple eviction workers running.
+ */
+ WT_RET(__wt_cond_signal(session, cache->evict_waiter_cond));
+
+ if (S2C(session)->evict_workers > 1) {
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_server_not_evicting);
+ /*
+ * If there are candidates queued, give other threads a chance
+ * to access them before gathering more.
+ */
+ if (candidates > 10 && cache->evict_current != NULL)
+ __wt_yield();
+ } else {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_server_evicting);
+ WT_RET(__evict_lru_pages(session, 0));
+ }
+
+ return (0);
+}
+
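/*
 * Editor's illustrative sketch -- not part of this change or of WiredTiger.
 * The candidate count above comes from a weighted cutoff, a quarter of the
 * way from the oldest to the newest read generation, clamped between roughly
 * 10% and 50% of the entries. The generations below are made-up inputs:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t gen[] = { 4, 8, 20, 40, 60, 80, 100, 120, 160, 200 };
	uint32_t entries = 10, candidates;

	/* Bottom 25% of the read-generation range. */
	uint64_t cutoff = (3 * gen[0] + gen[entries - 1]) / 4;

	/* At least 10% of entries (plus one), at most 50%. */
	for (candidates = 1 + entries / 10;
	    candidates < entries / 2; candidates++)
		if (gen[candidates] > cutoff)
			break;

	/* prints: cutoff 53, 4 of 10 entries are candidates */
	printf("cutoff %llu, %u of %u entries are candidates\n",
	    (unsigned long long)cutoff, candidates, entries);
	return (0);
}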
+/*
+ * __evict_walk --
+ * Fill in the array by walking the next set of pages.
+ */
+static int
+__evict_walk(WT_SESSION_IMPL *session, u_int *entriesp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ u_int max_entries, old_slot, retries, slot;
+
+ conn = S2C(session);
+ cache = S2C(session)->cache;
+ retries = 0;
+
+ /* Increment the shared read generation. */
+ __wt_cache_read_gen_incr(session);
+
+ /*
+ * Update the oldest ID: we use it to decide whether pages are
+ * candidates for eviction. Without this, if all threads are blocked
+ * after a long-running transaction (such as a checkpoint) completes,
+ * we may never start evicting again.
+ */
+ __wt_txn_update_oldest(session);
+
+ /*
+ * Set the starting slot in the queue and the maximum pages added
+ * per walk.
+ */
+ slot = cache->evict_entries;
+ max_entries = slot + WT_EVICT_WALK_INCR;
+ if (cache->evict_current == NULL)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
+ else
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_not_empty);
+
+ /*
+ * Lock the dhandle list so sweeping cannot change the pointers out
+ * from under us.
+ *
+ * NOTE: we don't hold the schema lock, so we have to take care
+ * that the handles we see are open and valid.
+ */
+ __wt_spin_lock(session, &conn->dhandle_lock);
+
+retry: SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ /* Ignore non-file handles, or handles that aren't open. */
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ /*
+ * Each time we reenter this function, start at the next handle
+ * on the list.
+ */
+ if (cache->evict_file_next != NULL &&
+ cache->evict_file_next != dhandle)
+ continue;
+ cache->evict_file_next = NULL;
+
+ /* Skip files that don't allow eviction. */
+ btree = dhandle->handle;
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ continue;
+
+ /*
+ * Also skip files that are configured to stick in cache until
+ * we get aggressive.
+ */
+ if (btree->evict_priority != 0 &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ continue;
+
+ /*
+ * If we are filling the queue, skip files that haven't been
+ * useful in the past.
+ */
+ if (btree->evict_walk_period != 0 &&
+ cache->evict_entries >= WT_EVICT_WALK_INCR &&
+ btree->evict_walk_skips++ < btree->evict_walk_period)
+ continue;
+ btree->evict_walk_skips = 0;
+ old_slot = slot;
+
+ __wt_spin_lock(session, &cache->evict_walk_lock);
+
+ /*
+ * Re-check the "no eviction" flag -- it is used to enforce
+ * exclusive access when a handle is being closed.
+ */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ WT_WITH_BTREE(session, btree,
+ ret = __evict_walk_file(session, &slot, flags));
+
+ __wt_spin_unlock(session, &cache->evict_walk_lock);
+
+ /*
+ * If we found enough candidates in the file (or filled the queue),
+ * keep visiting it; otherwise, back off exponentially and skip it
+ * for the next few walks.
+ */
+ if (slot >= old_slot + WT_EVICT_WALK_PER_FILE ||
+ slot >= max_entries)
+ btree->evict_walk_period = 0;
+ else
+ btree->evict_walk_period = WT_MIN(
+ WT_MAX(1, 2 * btree->evict_walk_period), 1000);
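+
+ /*
+ * Illustration: an unproductive file is skipped for an exponentially
+ * growing number of walks (1, 2, 4, ... capped at 1000); a single
+ * productive walk resets the period to 0.
+ */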
+
+ if (ret != 0 || slot >= max_entries)
+ break;
+ }
+
+ /* Walk the list of files a few times if we don't find enough pages. */
+ if (ret == 0 && slot < max_entries && ++retries < 10)
+ goto retry;
+
+ /* Remember the file to visit first on the next walk. */
+ if (dhandle != NULL)
+ dhandle = SLIST_NEXT(dhandle, l);
+ cache->evict_file_next = dhandle;
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+ *entriesp = slot;
+ return (ret);
+}
+
+/*
+ * __evict_init_candidate --
+ * Initialize a WT_EVICT_ENTRY structure with a given page.
+ */
+static void
+__evict_init_candidate(
+ WT_SESSION_IMPL *session, WT_EVICT_ENTRY *evict, WT_REF *ref)
+{
+ WT_CACHE *cache;
+ u_int slot;
+
+ cache = S2C(session)->cache;
+
+ /* Keep track of the maximum slot we are using. */
+ slot = (u_int)(evict - cache->evict);
+ if (slot >= cache->evict_max)
+ cache->evict_max = slot + 1;
+
+ if (evict->ref != NULL)
+ __evict_list_clear(session, evict);
+ evict->ref = ref;
+ evict->btree = S2BT(session);
+
+ /* Mark the page as being on the eviction list. */
+ F_SET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU);
+}
+
+/*
+ * __evict_walk_file --
+ * Get a few page eviction candidates from a single underlying file.
+ */
+static int
+__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_EVICT_ENTRY *end, *evict, *start;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ uint64_t pages_walked;
+ uint32_t walk_flags;
+ int internal_pages, modified, restarts;
+
+ btree = S2BT(session);
+ cache = S2C(session)->cache;
+ start = cache->evict + *slotp;
+ end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
+ cache->evict + cache->evict_slots);
+
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
+ /*
+ * Get some more eviction candidate pages.
+ */
+ for (evict = start, pages_walked = 0, internal_pages = restarts = 0;
+ evict < end && (ret == 0 || ret == WT_NOTFOUND);
+ ret = __wt_tree_walk(session, &btree->evict_ref, walk_flags),
+ ++pages_walked) {
+ if (btree->evict_ref == NULL) {
+ /*
+ * Take care with terminating this loop.
+ *
+ * Don't make an extra call to __wt_tree_walk: that will
+ * leave a page pinned, which may prevent any work from
+ * being done.
+ */
+ if (++restarts == 2)
+ break;
+ continue;
+ }
+
+ /* Ignore root pages entirely. */
+ if (__wt_ref_is_root(btree->evict_ref))
+ continue;
+ page = btree->evict_ref->page;
+
+ /*
+ * Use the EVICT_LRU flag to avoid putting pages onto the list
+ * multiple times.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ continue;
+
+ /* Limit internal pages to 50% unless we get aggressive. */
+ if ((page->type == WT_PAGE_COL_INT ||
+ page->type == WT_PAGE_ROW_INT) &&
+ ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ break;
+
+ /*
+ * If this page has never been considered for eviction, set its read
+ * generation to a little bit in the future and move on, giving
+ * readers a chance to start updating the read generation.
+ */
+ if (page->read_gen == WT_READGEN_NOTSET) {
+ page->read_gen = __wt_cache_read_gen_set(session);
+ continue;
+ }
+
+ /*
+ * If the file is being checkpointed, there's a period of time
+ * where we can't discard dirty pages because of possible races
+ * with the checkpointing thread.
+ */
+ modified = __wt_page_is_modified(page);
+ if (modified && btree->checkpointing)
+ continue;
+
+ /* Optionally ignore clean pages. */
+ if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+ continue;
+
+ /*
+ * If the page is clean but has modifications that appear too
+ * new to evict, skip it.
+ */
+ mod = page->modify;
+ if (!modified && mod != NULL &&
+ !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ continue;
+
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, it's unlikely that
+ * we can make progress. Similarly, if the most recent
+ * update on the page is not yet globally visible,
+ * eviction will fail. These heuristics attempt to
+ * avoid repeated attempts to evict the same page.
+ *
+ * That said, if eviction is stuck, or the file is
+ * being checkpointed, try anyway: maybe a transaction
+ * that was running last time we wrote the page has
+ * since rolled back, or we can help get the checkpoint
+ * completed sooner.
+ */
+ if (modified && !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
+ !btree->checkpointing &&
+ (mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
+ continue;
+
+ WT_ASSERT(session, evict->ref == NULL);
+ __evict_init_candidate(session, evict, btree->evict_ref);
+ ++evict;
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
+ "select: %p, size %" PRIu64, page, page->memory_footprint));
+ }
+
+ /* If the walk was interrupted by a locked page, that's okay. */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ *slotp += (u_int)(evict - start);
+ WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
+ return (ret);
+}
+
+/*
+ * __evict_get_ref --
+ * Get a page for eviction.
+ */
+static int
+__evict_get_ref(
+ WT_SESSION_IMPL *session, int is_app, WT_BTREE **btreep, WT_REF **refp)
+{
+ WT_CACHE *cache;
+ WT_EVICT_ENTRY *evict;
+ uint32_t candidates;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ cache = S2C(session)->cache;
+ *btreep = NULL;
+ *refp = NULL;
+
+ /*
+ * A pathological case: if we're the oldest transaction in the system
+ * and the eviction server is stuck trying to find space, abort the
+ * transaction to give up all hazard pointers before trying again.
+ */
+ if (is_app && F_ISSET(cache, WT_EVICT_STUCK) &&
+ __wt_txn_am_oldest(session)) {
+ F_CLR(cache, WT_EVICT_STUCK);
+ WT_STAT_FAST_CONN_INCR(session, txn_fail_cache);
+ return (WT_ROLLBACK);
+ }
+
+ /*
+ * Avoid the LRU lock if no pages are available. If there are pages
+ * available, spin until we get the lock. If this function returns
+ * without getting a page to evict, application threads assume there
+ * are no more pages available and will attempt to wake the eviction
+ * server.
+ */
+ for (;;) {
+ if (cache->evict_current == NULL)
+ return (WT_NOTFOUND);
+ if (__wt_spin_trylock(session, &cache->evict_lock, &id) == 0)
+ break;
+ __wt_yield();
+ }
+
+ /*
+ * The eviction server only tries to evict half of the pages before
+ * looking for more.
+ */
+ candidates = cache->evict_candidates;
+ if (!is_app && candidates > 1)
+ candidates /= 2;
+
+ /* Get the next page queued for eviction. */
+ while ((evict = cache->evict_current) != NULL &&
+ evict < cache->evict + candidates && evict->ref != NULL) {
+ WT_ASSERT(session, evict->btree != NULL);
+
+ /* Move to the next item. */
+ ++cache->evict_current;
+
+ /*
+ * Lock the page while holding the eviction mutex to prevent
+ * multiple attempts to evict it. For pages that are already
+ * being evicted, this operation will fail and we will move on.
+ */
+ if (!WT_ATOMIC_CAS4(
+ evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
+ __evict_list_clear(session, evict);
+ continue;
+ }
+
+ /*
+ * Increment the busy count in the btree handle to prevent it
+ * from being closed under us.
+ */
+ (void)WT_ATOMIC_ADD4(evict->btree->evict_busy, 1);
+
+ *btreep = evict->btree;
+ *refp = evict->ref;
+
+ /*
+ * Remove the entry so we never try to reconcile the same page
+ * on reconciliation error.
+ */
+ __evict_list_clear(session, evict);
+ break;
+ }
+
+ /* Clear the current pointer if there are no more candidates. */
+ if (evict >= cache->evict + cache->evict_candidates)
+ cache->evict_current = NULL;
+ __wt_spin_unlock(session, &cache->evict_lock);
+
+ return ((*refp == NULL) ? WT_NOTFOUND : 0);
+}
+
+/*
+ * __wt_evict_lru_page --
+ * Called by both eviction and application threads to evict a page.
+ */
+int
+__wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
+{
+ WT_BTREE *btree;
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *ref;
+
+ WT_RET(__evict_get_ref(session, is_app, &btree, &ref));
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ /*
+ * In case something goes wrong, don't pick the same set of pages every
+ * time.
+ *
+ * We used to bump the page's read generation only if eviction failed,
+ * but that isn't safe: at that point, eviction has already unlocked
+ * the page and some other thread may have evicted it by the time we
+ * look at it.
+ */
+ page = ref->page;
+ if (page->read_gen != WT_READGEN_OLDEST)
+ page->read_gen = __wt_cache_read_gen_set(session);
+
+ WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref));
+
+ (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+
+ WT_RET(ret);
+
+ cache = S2C(session)->cache;
+ if (F_ISSET(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK))
+ F_CLR(cache, WT_EVICT_NO_PROGRESS | WT_EVICT_STUCK);
+
+ return (ret);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_cache_dump --
+ * Dump debugging information to stdout about the size of the files in the
+ * cache.
+ *
+ * NOTE: this function is not called anywhere; it is intended to be called
+ * from a debugger.
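+ *
+ * For example, from a gdb prompt, assuming a WT_SESSION_IMPL pointer
+ * named "session" is in scope: call __wt_cache_dump(session)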
+ */
+void
+__wt_cache_dump(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_REF *next_walk;
+ WT_PAGE *page;
+ uint64_t file_intl_pages, file_leaf_pages;
+ uint64_t file_bytes, file_dirty, total_bytes;
+
+ conn = S2C(session);
+ total_bytes = 0;
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ continue;
+
+ btree = dhandle->handle;
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ continue;
+
+ file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
+ next_walk = NULL;
+ session->dhandle = dhandle;
+ while (__wt_tree_walk(session,
+ &next_walk, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 &&
+ next_walk != NULL) {
+ page = next_walk->page;
+ if (page->type == WT_PAGE_COL_INT ||
+ page->type == WT_PAGE_ROW_INT)
+ ++file_intl_pages;
+ else
+ ++file_leaf_pages;
+ file_bytes += page->memory_footprint;
+ if (__wt_page_is_modified(page))
+ file_dirty += page->memory_footprint;
+ }
+ session->dhandle = NULL;
+
+ printf("cache dump: %s [%s]:"
+ " %" PRIu64 " intl pages, %" PRIu64 " leaf pages,"
+ " %" PRIu64 "MB, %" PRIu64 "MB dirty\n",
+ dhandle->name, dhandle->checkpoint,
+ file_intl_pages, file_leaf_pages,
+ file_bytes >> 20, file_dirty >> 20);
+
+ total_bytes += file_bytes;
+ }
+ printf("cache dump: total found = %" PRIu64 "MB"
+ " vs tracked inuse %" PRIu64 "MB\n",
+ total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
+ fflush(stdout);
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
new file mode 100644
index 00000000000..a21d6d277d3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -0,0 +1,770 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
+static int __btree_get_last_recno(WT_SESSION_IMPL *);
+static int __btree_page_sizes(WT_SESSION_IMPL *);
+static int __btree_preload(WT_SESSION_IMPL *);
+static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int);
+
+static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
+static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int);
+
+/*
+ * __wt_btree_open --
+ * Open a Btree.
+ */
+int
+__wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT ckpt;
+ WT_CONFIG_ITEM cval;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ size_t root_addr_size;
+ uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int creation, forced_salvage, readonly;
+ const char *filename;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ /* Checkpoint files are readonly. */
+ readonly = dhandle->checkpoint == NULL ? 0 : 1;
+
+ /* Get the checkpoint information for this name/checkpoint pair. */
+ WT_CLEAR(ckpt);
+ WT_RET(__wt_meta_checkpoint(
+ session, dhandle->name, dhandle->checkpoint, &ckpt));
+
+ /*
+ * Bulk-load is only permitted on newly created files, not any empty
+ * file -- see the checkpoint code for a discussion.
+ */
+ creation = ckpt.raw.size == 0;
+ if (!creation && F_ISSET(btree, WT_BTREE_BULK))
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load is only supported on newly created objects");
+
+ /* Handle salvage configuration. */
+ forced_salvage = 0;
+ if (F_ISSET(btree, WT_BTREE_SALVAGE)) {
+ WT_ERR(__wt_config_gets(session, op_cfg, "force", &cval));
+ forced_salvage = (cval.val != 0);
+ }
+
+ /* Initialize and configure the WT_BTREE structure. */
+ WT_ERR(__btree_conf(session, &ckpt));
+
+ /* Connect to the underlying block manager. */
+ filename = dhandle->name;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_ERR_MSG(session, EINVAL, "expected a 'file:' URI");
+
+ WT_ERR(__wt_block_manager_open(session, filename, dhandle->cfg,
+ forced_salvage, readonly, btree->allocsize, &btree->bm));
+ bm = btree->bm;
+
+ /*
+ * !!!
+ * As part of block-manager configuration, we need to return the maximum
+ * sized address cookie that a block manager will ever return. There's
+ * a limit of WT_BTREE_MAX_ADDR_COOKIE, but at 255B, it's too large for
+ * a Btree with 512B internal pages. The default block manager packs
+ * a wt_off_t and 2 uint32_t's into its cookie, so there's no problem
+ * now, but when we create a block manager extension API, we need some
+ * way to consider the block manager's maximum cookie size versus the
+ * minimum Btree internal node size.
+ */
+ btree->block_header = bm->block_header(bm);
+
+ /*
+ * Open the specified checkpoint unless it's a special command (special
+ * commands are responsible for loading their own checkpoints, if any).
+ */
+ if (!F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) {
+ /*
+ * There are two reasons to load an empty tree rather than a
+ * checkpoint: either there is no checkpoint (the file is
+ * being created), or the load call returns no root page (the
+ * checkpoint is for an empty file).
+ */
+ WT_ERR(bm->checkpoint_load(bm, session,
+ ckpt.raw.data, ckpt.raw.size,
+ root_addr, &root_addr_size, readonly));
+ if (creation || root_addr_size == 0)
+ WT_ERR(__btree_tree_open_empty(
+ session, creation, readonly));
+ else {
+ WT_ERR(__wt_btree_tree_open(
+ session, root_addr, root_addr_size));
+
+ /* Warm the cache, if possible. */
+ WT_ERR(__btree_preload(session));
+
+ /* Get the last record number in a column-store file. */
+ if (btree->type != BTREE_ROW)
+ WT_ERR(__btree_get_last_recno(session));
+ }
+ }
+
+ if (0) {
+err: WT_TRET(__wt_btree_close(session));
+ }
+ __wt_meta_checkpoint_free(session, &ckpt);
+
+ return (ret);
+}
+
+/*
+ * __wt_btree_close --
+ * Close a Btree.
+ */
+int
+__wt_btree_close(WT_SESSION_IMPL *session)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ if ((bm = btree->bm) != NULL) {
+ /* Unload the checkpoint, unless it's a special command. */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+ WT_TRET(bm->checkpoint_unload(bm, session));
+
+ /* Close the underlying block manager reference. */
+ WT_TRET(bm->close(bm, session));
+
+ btree->bm = NULL;
+ }
+
+ /* Close the Huffman tree. */
+ __wt_btree_huffman_close(session);
+
+ /* Destroy locks. */
+ WT_TRET(__wt_rwlock_destroy(session, &btree->ovfl_lock));
+ __wt_spin_destroy(session, &btree->flush_lock);
+
+ /* Free allocated memory. */
+ __wt_free(session, btree->key_format);
+ __wt_free(session, btree->value_format);
+
+ if (btree->collator_owned) {
+ if (btree->collator->terminate != NULL)
+ WT_TRET(btree->collator->terminate(
+ btree->collator, &session->iface));
+ btree->collator_owned = 0;
+ }
+ btree->collator = NULL;
+
+ btree->bulk_load_ok = 0;
+
+ return (ret);
+}
+
+/*
+ * __btree_conf --
+ * Configure a WT_BTREE structure.
+ */
+static int
+__btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_NAMED_COMPRESSOR *ncomp;
+ int64_t maj_version, min_version;
+ uint32_t bitcnt;
+ int fixed;
+ const char **cfg;
+
+ btree = S2BT(session);
+ conn = S2C(session);
+ cfg = btree->dhandle->cfg;
+
+ /* Dump out format information. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_VERSION)) {
+ WT_RET(__wt_config_gets(session, cfg, "version.major", &cval));
+ maj_version = cval.val;
+ WT_RET(__wt_config_gets(session, cfg, "version.minor", &cval));
+ min_version = cval.val;
+ WT_RET(__wt_verbose(session, WT_VERB_VERSION,
+ "%" PRIu64 ".%" PRIu64, maj_version, min_version));
+ }
+
+ /* Get the file ID. */
+ WT_RET(__wt_config_gets(session, cfg, "id", &cval));
+ btree->id = (uint32_t)cval.val;
+
+ /* Validate file types and check the data format plan. */
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+ if (WT_STRING_MATCH("r", cval.str, cval.len))
+ btree->type = BTREE_COL_VAR;
+ else
+ btree->type = BTREE_ROW;
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->key_format));
+
+ WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &btree->value_format));
+
+ /* Row-store key comparison and key gap for prefix compression. */
+ if (btree->type == BTREE_ROW) {
+ WT_RET(__wt_collator_config(
+ session, cfg, &btree->collator, &btree->collator_owned));
+
+ WT_RET(__wt_config_gets(session, cfg, "key_gap", &cval));
+ btree->key_gap = (uint32_t)cval.val;
+ }
+
+ /* Column-store: check for fixed-size data. */
+ if (btree->type == BTREE_COL_VAR) {
+ WT_RET(__wt_struct_check(
+ session, cval.str, cval.len, &fixed, &bitcnt));
+ if (fixed) {
+ if (bitcnt == 0 || bitcnt > 8)
+ WT_RET_MSG(session, EINVAL,
+ "fixed-width field sizes must be greater "
+ "than 0 and less than or equal to 8");
+ btree->bitcnt = (uint8_t)bitcnt;
+ btree->type = BTREE_COL_FIX;
+ }
+ }
+
+ /* Page sizes */
+ WT_RET(__btree_page_sizes(session));
+
+ /* Eviction; the metadata file is never evicted. */
+ if (WT_IS_METADATA(btree->dhandle))
+ F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+ else {
+ WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
+ if (cval.val)
+ F_SET(btree, WT_BTREE_NO_EVICTION | WT_BTREE_NO_HAZARD);
+ else
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+ }
+
+ /* Checksums */
+ WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
+ if (WT_STRING_MATCH("on", cval.str, cval.len))
+ btree->checksum = CKSUM_ON;
+ else if (WT_STRING_MATCH("off", cval.str, cval.len))
+ btree->checksum = CKSUM_OFF;
+ else
+ btree->checksum = CKSUM_UNCOMPRESSED;
+
+ /* Huffman encoding */
+ WT_RET(__wt_btree_huffman_open(session));
+
+ /*
+ * Reconciliation configuration:
+ * Block compression (all)
+ * Dictionary compression (variable-length column-store, row-store)
+ * Page-split percentage
+ * Prefix compression (row-store)
+ * Suffix compression (row-store)
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ break;
+ case BTREE_ROW:
+ WT_RET(__wt_config_gets(
+ session, cfg, "internal_key_truncate", &cval));
+ btree->internal_key_truncate = cval.val == 0 ? 0 : 1;
+
+ WT_RET(__wt_config_gets(
+ session, cfg, "prefix_compression", &cval));
+ btree->prefix_compression = cval.val == 0 ? 0 : 1;
+ WT_RET(__wt_config_gets(
+ session, cfg, "prefix_compression_min", &cval));
+ btree->prefix_compression_min = (u_int)cval.val;
+ /* FALLTHROUGH */
+ case BTREE_COL_VAR:
+ WT_RET(__wt_config_gets(session, cfg, "dictionary", &cval));
+ btree->dictionary = (u_int)cval.val;
+ break;
+ }
+
+ WT_RET(__wt_config_gets(session, cfg, "block_compressor", &cval));
+ if (cval.len > 0) {
+ TAILQ_FOREACH(ncomp, &conn->compqh, q)
+ if (WT_STRING_MATCH(ncomp->name, cval.str, cval.len)) {
+ btree->compressor = ncomp->compressor;
+ break;
+ }
+ if (btree->compressor == NULL)
+ WT_RET_MSG(session, EINVAL,
+ "unknown block compressor '%.*s'",
+ (int)cval.len, cval.str);
+ }
+
+ /* Initialize locks. */
+ WT_RET(__wt_rwlock_alloc(
+ session, &btree->ovfl_lock, "btree overflow lock"));
+ WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock"));
+
+ __wt_stat_init_dsrc_stats(&btree->dhandle->stats);
+
+ btree->write_gen = ckpt->write_gen; /* Write generation */
+ btree->modified = 0; /* Clean */
+
+ return (0);
+}
+
+/*
+ * __wt_root_ref_init --
+ * Initialize a tree root reference, and link in the root page.
+ */
+void
+__wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno)
+{
+ memset(root_ref, 0, sizeof(*root_ref));
+
+ root_ref->page = root;
+ root_ref->state = WT_REF_MEM;
+
+ root_ref->key.recno = is_recno ? 1 : 0;
+
+ root->pg_intl_parent_ref = root_ref;
+}
+
+/*
+ * __wt_btree_tree_open --
+ * Read in a tree from disk.
+ */
+int
+__wt_btree_tree_open(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_ITEM dsk;
+ WT_PAGE *page;
+
+ btree = S2BT(session);
+
+ /*
+ * A buffer into which we read a root page; don't use a scratch buffer:
+ * the buffer's allocated memory becomes the persistent in-memory page.
+ */
+ WT_CLEAR(dsk);
+
+ /* Read the page, then build the in-memory version of the page. */
+ WT_ERR(__wt_bt_read(session, &dsk, addr, addr_size));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk.data,
+ WT_DATA_IN_ITEM(&dsk) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /* Finish initializing the root, root reference links. */
+ __wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
+
+ if (0) {
+err: __wt_buf_free(session, &dsk);
+ }
+ return (ret);
+}
+
+/*
+ * __btree_tree_open_empty --
+ * Create an empty in-memory tree.
+ */
+static int
+__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *root, *leaf;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref;
+
+ btree = S2BT(session);
+ root = leaf = NULL;
+
+ /*
+ * Newly created objects can be used for cursor inserts or for bulk
+ * loads; set a flag that's cleared when a row is inserted into the
+ * tree. Objects being bulk-loaded cannot be evicted; we set that
+ * globally because there's no point in searching empty trees for
+ * eviction.
+ */
+ if (creation) {
+ btree->bulk_load_ok = 1;
+ __wt_btree_evictable(session, 0);
+ }
+
+ /*
+ * A note about empty trees: the initial tree is a root page and a leaf
+ * page. We need a pair of pages instead of just a single page because
+ * we can reconcile the leaf page while the root stays pinned in memory.
+ * If the pair is evicted without being modified, that's OK, nothing is
+ * ever written.
+ *
+ * Create the root and leaf pages.
+ *
+ * !!!
+ * Be cautious about changing the order of updates in this code: to call
+ * __wt_page_out on error, we require a correct page setup at each point
+ * where we might fail.
+ */
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ WT_ERR(
+ __wt_page_alloc(session, WT_PAGE_COL_INT, 1, 1, 1, &root));
+ root->pg_intl_parent_ref = &btree->root;
+
+ pindex = WT_INTL_INDEX_COPY(root);
+ ref = pindex->index[0];
+ ref->home = root;
+ WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+ ref->page = leaf;
+ ref->addr = NULL;
+ ref->state = WT_REF_MEM;
+ ref->key.recno = 1;
+ break;
+ case BTREE_ROW:
+ WT_ERR(
+ __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, 1, 1, &root));
+ root->pg_intl_parent_ref = &btree->root;
+
+ pindex = WT_INTL_INDEX_COPY(root);
+ ref = pindex->index[0];
+ ref->home = root;
+ WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
+ ref->page = leaf;
+ ref->addr = NULL;
+ ref->state = WT_REF_MEM;
+ WT_ERR(__wt_row_ikey_incr(
+ session, root, 0, "", 1, &ref->key.ikey));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Mark the leaf page dirty: we didn't create an entirely valid root
+ * page (specifically, the root page's disk address isn't set, and it's
+ * the act of reconciling the leaf page that makes it work; we don't
+ * try to use the original disk address of modified pages). We could
+ * get around that by leaving the leaf page clean and building a better
+ * root page, but then we get into trouble because a checkpoint marks
+ * the root page dirty to force a write, and without reconciling the
+ * leaf page we won't realize there are no records to write: we'll write
+ * a root page, which isn't correct for an empty tree.
+ *
+ * Earlier versions of this code kept the leaf page clean, but with the
+ * "empty" flag set in the leaf page's modification structure; in that
+ * case, checkpoints worked (forced reconciliation of a root with a
+ * single "empty" page wouldn't write any blocks). That version had
+ * memory leaks because the eviction code didn't correctly handle pages
+ * that were "clean" (and so never reconciled), yet "modified" with an
+ * "empty" flag. The goal of this code is to mimic a real tree that
+ * simply has no records, for whatever reason, and trust reconciliation
+ * to figure out it's empty and not write any blocks.
+ *
+ * We do not set the tree's modified flag because the checkpoint code
+ * skips unmodified files in closing checkpoints (checkpoints that
+ * don't require a write unless the file is actually dirty). There's
+ * no need to reconcile this file unless the application does a real
+ * checkpoint or it's actually modified.
+ *
+ * Only do this for a live tree, not for checkpoints. If we open an
+ * empty checkpoint, the leaf page cannot be dirty or eviction may try
+ * to write it, which will fail because checkpoints are read-only.
+ */
+ if (!readonly) {
+ WT_ERR(__wt_page_modify_init(session, leaf));
+ __wt_page_only_modify_set(session, leaf);
+ }
+
+ /* Finish initializing the root, root reference links. */
+ __wt_root_ref_init(&btree->root, root, btree->type != BTREE_ROW);
+
+ return (0);
+
+err: if (leaf != NULL)
+ __wt_page_out(session, &leaf);
+ if (root != NULL)
+ __wt_page_out(session, &root);
+ return (ret);
+}
+
+/*
+ * __wt_btree_new_leaf_page --
+ * Create an empty leaf page and link it into a reference in its parent.
+ */
+int
+__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_COL_FIX, 1, 0, 1, pagep));
+ break;
+ case BTREE_COL_VAR:
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_COL_VAR, 1, 0, 1, pagep));
+ break;
+ case BTREE_ROW:
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, 1, pagep));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __wt_btree_evictable --
+ * Setup or release a cache-resident tree.
+ */
+void
+__wt_btree_evictable(WT_SESSION_IMPL *session, int on)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* The metadata file is never evicted. */
+ if (on && !WT_IS_METADATA(btree->dhandle))
+ F_CLR(btree, WT_BTREE_NO_EVICTION);
+ else
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+}
+
+/*
+ * __btree_preload --
+ * Pre-load internal pages.
+ */
+static int
+__btree_preload(WT_SESSION_IMPL *session)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_REF *ref;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /* Pre-load the second-level internal pages. */
+ WT_INTL_FOREACH_BEGIN(session, btree->root.page, ref) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr != NULL)
+ WT_RET(bm->preload(bm, session, addr, addr_size));
+ } WT_INTL_FOREACH_END;
+ return (0);
+}
+
+/*
+ * __btree_get_last_recno --
+ * Set the last record number for a column-store.
+ */
+static int
+__btree_get_last_recno(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_REF *next_walk;
+
+ btree = S2BT(session);
+
+ next_walk = NULL;
+ WT_RET(__wt_tree_walk(session, &next_walk, WT_READ_PREV));
+ if (next_walk == NULL)
+ return (WT_NOTFOUND);
+
+ page = next_walk->page;
+ btree->last_recno = page->type == WT_PAGE_COL_VAR ?
+ __col_var_last_recno(page) : __col_fix_last_recno(page);
+
+ return (__wt_page_release(session, next_walk, 0));
+}
+
+/*
+ * __btree_page_sizes --
+ * Verify the page sizes. Some of these sizes are automatically checked
+ * using limits defined in the API; don't duplicate the logic here.
+ */
+static int
+__btree_page_sizes(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ uint64_t cache_size;
+ uint32_t intl_split_size, leaf_split_size;
+ const char **cfg;
+
+ btree = S2BT(session);
+ cfg = btree->dhandle->cfg;
+
+ WT_RET(__wt_direct_io_size_check(
+ session, cfg, "allocation_size", &btree->allocsize));
+ WT_RET(__wt_direct_io_size_check(
+ session, cfg, "internal_page_max", &btree->maxintlpage));
+ WT_RET(__wt_config_gets(session, cfg, "internal_item_max", &cval));
+ btree->maxintlitem = (uint32_t)cval.val;
+ WT_RET(__wt_direct_io_size_check(
+ session, cfg, "leaf_page_max", &btree->maxleafpage));
+ WT_RET(__wt_config_gets(session, cfg, "leaf_item_max", &cval));
+ btree->maxleafitem = (uint32_t)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval));
+ btree->split_pct = (int)cval.val;
+
+ /*
+ * When a page is forced to split, we want at least 50 entries on its
+ * parent.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
+ btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * btree->maxleafpage);
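+
+ /*
+ * For illustration (made-up size, not a default): with a 32KB
+ * leaf_page_max, the in-memory page maximum is floored at
+ * 50 * 32KB = 1600KB, even if memory_page_max is configured smaller.
+ */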
+
+ /*
+ * Don't let pages grow to more than half the cache size. Otherwise,
+ * with very small caches, we can end up in a situation where nothing
+ * can be evicted. Take care getting the cache size: with a shared
+ * cache, it may not have been set.
+ */
+ cache_size = S2C(session)->cache_size;
+ if (cache_size > 0)
+ btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 2);
+
+ /* Allocation sizes must be a power-of-two, nothing else makes sense. */
+ if (!__wt_ispo2(btree->allocsize))
+ WT_RET_MSG(session,
+ EINVAL, "the allocation size must be a power of two");
+
+ /* All page sizes must be in units of the allocation size. */
+ if (btree->maxintlpage < btree->allocsize ||
+ btree->maxintlpage % btree->allocsize != 0 ||
+ btree->maxleafpage < btree->allocsize ||
+ btree->maxleafpage % btree->allocsize != 0)
+ WT_RET_MSG(session, EINVAL,
+ "page sizes must be a multiple of the page allocation "
+ "size (%" PRIu32 "B)", btree->allocsize);
+
+ /*
+ * Set the split percentage: reconciliation splits to a smaller-than-
+ * maximum page size so we don't split every time a new entry is added.
+ */
+ intl_split_size = __wt_split_page_size(btree, btree->maxintlpage);
+ leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage);
+
+ /*
+ * Default values for internal and leaf page items: make sure at least
+ * 8 items fit on split pages.
+ */
+ if (btree->maxintlitem == 0)
+ btree->maxintlitem = intl_split_size / 8;
+ if (btree->maxleafitem == 0)
+ btree->maxleafitem = leaf_split_size / 8;
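+
+ /*
+ * For illustration (made-up size): a 24KB leaf split size yields a
+ * default leaf_item_max of 24KB / 8 = 3KB.
+ */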
+
+ /*
+ * If raw compression is configured, the application owns page layout,
+ * it's not our problem. Hopefully the application chose well.
+ */
+ if (btree->compressor != NULL &&
+ btree->compressor->compress_raw != NULL)
+ return (0);
+
+ /* Check we can fit at least 2 items on a page. */
+ if (btree->maxintlitem > btree->maxintlpage / 2)
+ return (pse1(session, "internal",
+ btree->maxintlpage, btree->maxintlitem));
+ if (btree->maxleafitem > btree->maxleafpage / 2)
+ return (pse1(session, "leaf",
+ btree->maxleafpage, btree->maxleafitem));
+
+ /*
+ * Take into account the size of a split page:
+ *
+ * Make it a separate error message so it's clear what went wrong.
+ */
+ if (btree->maxintlitem > intl_split_size / 2)
+ return (pse2(session, "internal",
+ btree->maxintlpage, btree->maxintlitem, btree->split_pct));
+ if (btree->maxleafitem > leaf_split_size / 2)
+ return (pse2(session, "leaf",
+ btree->maxleafpage, btree->maxleafitem, btree->split_pct));
+
+ return (0);
+}
+
+/*
+ * __wt_split_page_size --
+ * Split page size calculation: we don't want to repeatedly split every
+ * time a new entry is added, so we split to a smaller-than-maximum page size.
+ */
+uint32_t
+__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize)
+{
+ uintmax_t a;
+ uint32_t split_size;
+
+ /*
+ * Ideally, the split page size is some percentage of the maximum page
+ * size rounded to an allocation unit (round to an allocation unit so
+ * we don't waste space when we write).
+ */
+ a = maxpagesize; /* Don't overflow. */
+ split_size = (uint32_t)
+ WT_ALIGN((a * (u_int)btree->split_pct) / 100, btree->allocsize);
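+
+ /*
+ * For illustration (made-up sizes, not defaults): a 32KB maximum page
+ * size with a split percentage of 75 and a 4KB allocation size gives
+ * WT_ALIGN((32768 * 75) / 100, 4096), that is, 24KB split pages.
+ */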
+
+ /*
+ * If the result of that calculation is the same as the allocation unit
+ * (which happens if the maximum page size is the same size as an
+ * allocation unit), use a percentage of the maximum page size.
+ */
+ if (split_size == btree->allocsize)
+ split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100);
+
+ return (split_size);
+}
+
+/*
+ * pse1 --
+ * Page size error message 1.
+ */
+static int
+pse1(WT_SESSION_IMPL *session, const char *type, uint32_t max, uint32_t ovfl)
+{
+ WT_RET_MSG(session, EINVAL,
+ "%s page size (%" PRIu32 "B) too small for the maximum item size "
+ "(%" PRIu32 "B); the page must be able to hold at least 2 items",
+ type, max, ovfl);
+}
+
+/*
+ * pse2 --
+ * Page size error message 2.
+ */
+static int
+pse2(WT_SESSION_IMPL *session,
+ const char *type, uint32_t max, uint32_t ovfl, int pct)
+{
+ WT_RET_MSG(session, EINVAL,
+ "%s page size (%" PRIu32 "B) too small for the maximum item size "
+ "(%" PRIu32 "B), because of the split percentage (%d %%); a split "
+ "page must be able to hold at least 2 items",
+ type, max, ovfl, pct);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c
new file mode 100644
index 00000000000..aa6e7c36451
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * 7-bit ASCII, with English language frequencies.
+ *
+ * Based on "Case-sensitive letter and bigram frequency counts from large-scale
+ * English corpora"
+ * Michael N. Jones and D.J.K. Mewhort
+ * Queen's University, Kingston, Ontario, Canada
+ * Behavior Research Methods, Instruments, & Computers 2004, 36 (3), 388-396
+ *
+ * Additionally supports space and tab characters; space is the most common
+ * character in text where it occurs, and tab appears about as frequently as
+ * 'a' and 'n' in text where it occurs.
+ */
+struct __wt_huffman_table {
+ uint32_t symbol;
+ uint32_t frequency;
+};
+static const struct __wt_huffman_table __wt_huffman_nytenglish[] = {
+ /* nul */ { 0x00, 0 }, /* For an escape character. */
+ /* ht */ { 0x09, 5263779 },
+ /* sp */ { 0x20, 8000000 },
+ /* ! */ { 0x21, 2178 },
+ /* " */ { 0x22, 284671 },
+ /* # */ { 0x23, 10 },
+ /* $ */ { 0x24, 51572 },
+ /* % */ { 0x25, 1993 },
+ /* & */ { 0x26, 6523 },
+ /* ' */ { 0x27, 204497 },
+ /* ( */ { 0x28, 53398 },
+ /* ) */ { 0x29, 53735 },
+ /* * */ { 0x2a, 20716 },
+ /* + */ { 0x2b, 309 },
+ /* , */ { 0x2c, 984969 },
+ /* - */ { 0x2d, 252302 },
+ /* . */ { 0x2e, 946136 },
+ /* / */ { 0x2f, 8161 },
+ /* 0 */ { 0x30, 546233 },
+ /* 1 */ { 0x31, 460946 },
+ /* 2 */ { 0x32, 333499 },
+ /* 3 */ { 0x33, 187606 },
+ /* 4 */ { 0x34, 192528 },
+ /* 5 */ { 0x35, 374413 },
+ /* 6 */ { 0x36, 153865 },
+ /* 7 */ { 0x37, 120094 },
+ /* 8 */ { 0x38, 182627 },
+ /* 9 */ { 0x39, 282364 },
+ /* : */ { 0x3a, 54036 },
+ /* ; */ { 0x3b, 36727 },
+ /* < */ { 0x3c, 82 },
+ /* = */ { 0x3d, 22 },
+ /* > */ { 0x3e, 83 },
+ /* ? */ { 0x3f, 12357 },
+ /* @ */ { 0x40, 1 },
+ /* A */ { 0x41, 280937 },
+ /* B */ { 0x42, 169474 },
+ /* C */ { 0x43, 229363 },
+ /* D */ { 0x44, 129632 },
+ /* E */ { 0x45, 138443 },
+ /* F */ { 0x46, 100751 },
+ /* G */ { 0x47, 93212 },
+ /* H */ { 0x48, 123632 },
+ /* I */ { 0x49, 223312 },
+ /* J */ { 0x4a, 78706 },
+ /* K */ { 0x4b, 46580 },
+ /* L */ { 0x4c, 106984 },
+ /* M */ { 0x4d, 259474 },
+ /* N */ { 0x4e, 205409 },
+ /* O */ { 0x4f, 105700 },
+ /* P */ { 0x50, 144239 },
+ /* Q */ { 0x51, 11659 },
+ /* R */ { 0x52, 146448 },
+ /* S */ { 0x53, 304971 },
+ /* T */ { 0x54, 325462 },
+ /* U */ { 0x55, 57488 },
+ /* V */ { 0x56, 31053 },
+ /* W */ { 0x57, 107195 },
+ /* X */ { 0x58, 7578 },
+ /* Y */ { 0x59, 94297 },
+ /* Z */ { 0x5a, 5610 },
+ /* [ */ { 0x5b, 1 },
+ /* \ */ { 0x5c, 1 },
+ /* ] */ { 0x5d, 1 },
+ /* ^ */ { 0x5e, 1 },
+ /* _ */ { 0x5f, 1 },
+ /* ` */ { 0x60, 1 },
+ /* a */ { 0x61, 5263779 },
+ /* b */ { 0x62, 866156 },
+ /* c */ { 0x63, 1960412 },
+ /* d */ { 0x64, 2369820 },
+ /* e */ { 0x65, 7741842 },
+ /* f */ { 0x66, 1296925 },
+ /* g */ { 0x67, 1206747 },
+ /* h */ { 0x68, 2955858 },
+ /* i */ { 0x69, 4527332 },
+ /* j */ { 0x6a, 65856 },
+ /* k */ { 0x6b, 460788 },
+ /* l */ { 0x6c, 2553152 },
+ /* m */ { 0x6d, 1467376 },
+ /* n */ { 0x6e, 4535545 },
+ /* o */ { 0x6f, 4729266 },
+ /* p */ { 0x70, 1255579 },
+ /* q */ { 0x71, 54221 },
+ /* r */ { 0x72, 4137949 },
+ /* s */ { 0x73, 4186210 },
+ /* t */ { 0x74, 5507692 },
+ /* u */ { 0x75, 1613323 },
+ /* v */ { 0x76, 653370 },
+ /* w */ { 0x77, 1015656 },
+ /* x */ { 0x78, 123577 },
+ /* y */ { 0x79, 1062040 },
+ /* z */ { 0x7a, 66423 },
+ /* { */ { 0x7b, 1 },
+ /* | */ { 0x7c, 1 },
+ /* } */ { 0x7d, 1 },
+ /* ~ */ { 0x7e, 1 }
+};
+
+static int __wt_huffman_read(WT_SESSION_IMPL *,
+ WT_CONFIG_ITEM *, struct __wt_huffman_table **, u_int *, u_int *);
+
+/*
+ * __wt_btree_huffman_open --
+ * Configure Huffman encoding for the tree.
+ */
+int
+__wt_btree_huffman_open(WT_SESSION_IMPL *session)
+{
+ struct __wt_huffman_table *table;
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM key_conf, value_conf;
+ WT_DECL_RET;
+ const char **cfg;
+ u_int entries, numbytes;
+
+ btree = S2BT(session);
+ cfg = btree->dhandle->cfg;
+
+ WT_RET(__wt_config_gets(session, cfg, "huffman_key", &key_conf));
+ WT_RET(__wt_config_gets(session, cfg, "huffman_value", &value_conf));
+ if (key_conf.len == 0 && value_conf.len == 0)
+ return (0);
+
+ switch (btree->type) { /* Check file type compatibility. */
+ case BTREE_COL_FIX:
+ WT_RET_MSG(session, EINVAL,
+ "fixed-size column-store files may not be Huffman encoded");
+ case BTREE_COL_VAR:
+ if (key_conf.len != 0)
+ WT_RET_MSG(session, EINVAL,
+ "the keys of variable-length column-store files "
+ "may not be Huffman encoded");
+ break;
+ case BTREE_ROW:
+ break;
+ }
+
+ if (strncasecmp(key_conf.str, "english", key_conf.len) == 0) {
+ struct __wt_huffman_table
+ copy[WT_ELEMENTS(__wt_huffman_nytenglish)];
+
+ memcpy(copy,
+ __wt_huffman_nytenglish, sizeof(__wt_huffman_nytenglish));
+ WT_RET(__wt_huffman_open(session, copy,
+ WT_ELEMENTS(__wt_huffman_nytenglish),
+ 1, &btree->huffman_key));
+
+ /* Check for a shared key/value table. */
+ if (strncasecmp(
+ value_conf.str, "english", value_conf.len) == 0) {
+ btree->huffman_value = btree->huffman_key;
+ return (0);
+ }
+ } else {
+ WT_RET(__wt_huffman_read(
+ session, &key_conf, &table, &entries, &numbytes));
+ ret = __wt_huffman_open(session, table,
+ entries, numbytes, &btree->huffman_key);
+ __wt_free(session, table);
+ if (ret != 0)
+ return (ret);
+
+ /* Check for a shared key/value table. */
+ if (value_conf.len != 0 && key_conf.len == value_conf.len &&
+ memcmp(key_conf.str, value_conf.str, key_conf.len) == 0) {
+ btree->huffman_value = btree->huffman_key;
+ return (0);
+ }
+ }
+ if (strncasecmp(value_conf.str, "english", value_conf.len) == 0) {
+ struct __wt_huffman_table
+ copy[WT_ELEMENTS(__wt_huffman_nytenglish)];
+
+ memcpy(copy,
+ __wt_huffman_nytenglish, sizeof(__wt_huffman_nytenglish));
+ WT_RET(__wt_huffman_open(session, copy,
+ WT_ELEMENTS(__wt_huffman_nytenglish),
+ 1, &btree->huffman_value));
+ } else {
+ WT_RET(__wt_huffman_read(
+ session, &value_conf, &table, &entries, &numbytes));
+ ret = __wt_huffman_open(session, table,
+ entries, numbytes, &btree->huffman_value);
+ __wt_free(session, table);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_huffman_read --
+ * Read a Huffman table from a file.
+ */
+static int
+__wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
+ struct __wt_huffman_table **tablep, u_int *entriesp, u_int *numbytesp)
+{
+ struct __wt_huffman_table *table, *tp;
+ FILE *fp;
+ WT_DECL_RET;
+ uint64_t symbol, frequency;
+ u_int entries, lineno;
+ char *file;
+
+ *tablep = NULL;
+ *entriesp = *numbytesp = 0;
+
+ fp = NULL;
+ file = NULL;
+ table = NULL;
+
+ /*
+ * A UTF-8 table has single-byte symbols, with a range of 0-255.
+ * A UTF-16 table has 2-byte symbols, with a range of 0-65535
+ * (2 * 65536 bytes, or 128KB, of symbols).
+ */
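+ /*
+ * The table file itself is plain text: one "symbol frequency" pair of
+ * unsigned decimal integers per line, parsed by the fscanf call below.
+ * For example, a UTF-8 table file might begin:
+ *
+ * 97 5263779
+ * 101 7741842
+ */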
+ if (strncasecmp(ip->str, "utf8", 4) == 0) {
+ entries = UINT8_MAX;
+ *numbytesp = 1;
+ WT_ERR(__wt_calloc_def(session, entries, &table));
+
+ if (ip->len == 4)
+ WT_ERR_MSG(session, EINVAL,
+ "no Huffman table file name specified");
+ WT_ERR(__wt_calloc_def(session, ip->len, &file));
+ memcpy(file, ip->str + 4, ip->len - 4);
+ } else if (strncasecmp(ip->str, "utf16", 5) == 0) {
+ entries = UINT16_MAX;
+ *numbytesp = 2;
+ WT_ERR(__wt_calloc_def(session, entries, &table));
+
+ if (ip->len == 5)
+ WT_ERR_MSG(session, EINVAL,
+ "no Huffman table file name specified");
+ WT_ERR(__wt_calloc_def(session, ip->len, &file));
+ memcpy(file, ip->str + 5, ip->len - 5);
+ } else {
+ WT_ERR_MSG(session, EINVAL,
+ "unknown Huffman configuration value %.*s",
+ (int)ip->len, ip->str);
+ }
+
+ if ((fp = fopen(file, "r")) == NULL)
+ WT_ERR_MSG(session, __wt_errno(),
+ "unable to read Huffman table file %.*s",
+ (int)ip->len, ip->str);
+
+ for (tp = table, lineno = 1; (ret =
+ fscanf(fp, "%" SCNu64 " %" SCNu64, &symbol, &frequency)) != EOF;
+ ++tp, ++lineno) {
+ if (lineno > entries)
+ WT_ERR_MSG(session, EINVAL,
+ "Huffman table file %.*s is corrupted, "
+ "more than %" PRIu32 " entries",
+ (int)ip->len, ip->str, entries);
+ if (ret != 2)
+ WT_ERR_MSG(session, EINVAL,
+ "line %u of Huffman table file %.*s is corrupted: "
+ "expected two unsigned integral values",
+ lineno, (int)ip->len, ip->str);
+ if (symbol > entries)
+ WT_ERR_MSG(session, EINVAL,
+ "line %u of Huffman table file %.*s is corrupted: "
+ "symbol larger than maximum value of %u",
+ lineno, (int)ip->len, ip->str, entries);
+ if (frequency > UINT32_MAX)
+ WT_ERR_MSG(session, EINVAL,
+ "line %u of Huffman table file %.*s is corrupted: "
+ "frequency larger than maximum value of %" PRIu32,
+ lineno, (int)ip->len, ip->str, UINT32_MAX);
+
+ tp->symbol = (uint32_t)symbol;
+ tp->frequency = (uint32_t)frequency;
+ }
+
+ *entriesp = lineno - 1;
+ *tablep = table;
+
+ if (0) {
+err: __wt_free(session, table);
+ }
+ if (fp != NULL)
+ (void)fclose(fp);
+ __wt_free(session, file);
+ return (ret);
+}
+
+/*
+ * __wt_btree_huffman_close --
+ * Close the Huffman tables.
+ */
+void
+__wt_btree_huffman_close(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ if (btree->huffman_key != NULL) {
+ /* Key and data may use the same table, only close it once. */
+ if (btree->huffman_value == btree->huffman_key)
+ btree->huffman_value = NULL;
+
+ __wt_huffman_close(session, btree->huffman_key);
+ btree->huffman_key = NULL;
+ }
+ if (btree->huffman_value != NULL) {
+ __wt_huffman_close(session, btree->huffman_value);
+ btree->huffman_value = NULL;
+ }
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
new file mode 100644
index 00000000000..ccc67c994dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_read --
+ * Read a cookie referenced block into a buffer.
+ */
+int
+__wt_bt_read(WT_SESSION_IMPL *session,
+ WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ size_t result_len;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /*
+ * If anticipating a compressed block, read into a scratch buffer and
+ * decompress into the caller's buffer. Else, read directly into the
+ * caller's buffer.
+ */
+ if (btree->compressor == NULL) {
+ WT_RET(bm->read(bm, session, buf, addr, addr_size));
+ dsk = buf->data;
+ } else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
+ dsk = tmp->data;
+ }
+
+ /*
+ * If the block is compressed, copy the skipped bytes of the original
+ * image into place, then decompress.
+ */
+ if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
+ if (btree->compressor == NULL ||
+ btree->compressor->decompress == NULL)
+ WT_ERR_MSG(session, WT_ERROR,
+ "read compressed block where no compression engine "
+ "configured");
+
+ /*
+ * We're allocating the exact number of bytes we're expecting
+ * from decompression.
+ */
+ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));
+
+ /*
+ * Note the source length is NOT the number of compressed bytes;
+ * it's the length of the block we just read (minus the skipped
+ * bytes). We don't store the number of compressed bytes: some
+ * compression engines need that length stored externally, they
+ * don't have markers in the stream to signal the end of the
+ * compressed bytes. Those engines must store the compressed
+ * byte length somehow, see the snappy compression extension for
+ * an example.
+ */
+ memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP);
+ ret = btree->compressor->decompress(
+ btree->compressor, &session->iface,
+ (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+ tmp->size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
+ dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);
+
+ /*
+ * If checksums were turned off because we're depending on the
+ * decompression to fail on any corrupted data, we'll end up
+ * here after corruption happens. If we're salvaging the file,
+ * it's OK, otherwise it's really, really bad.
+ */
+ if (ret != 0 ||
+ result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+ WT_ERR(
+ F_ISSET(btree, WT_BTREE_VERIFY) ||
+ F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+ WT_ERROR :
+ __wt_illegal_value(session, btree->dhandle->name));
+ } else
+ if (btree->compressor == NULL)
+ buf->size = dsk->mem_size;
+ else
+ /*
+ * We guessed wrong: there was a compressor, but this
+ * block was not compressed, and now the page is in the
+ * wrong buffer and the buffer may be of the wrong size.
+ * This should be rare, but happens with small blocks
+ * that aren't worth compressing.
+ */
+ WT_ERR(__wt_buf_set(
+ session, buf, tmp->data, dsk->mem_size));
+
+ /* If the handle is a verify handle, verify the physical page. */
+ if (F_ISSET(btree, WT_BTREE_VERIFY)) {
+ if (tmp == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
+ WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
+ }
+
+ WT_STAT_FAST_CONN_INCR(session, cache_read);
+ WT_STAT_FAST_DATA_INCR(session, cache_read);
+ if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
+ WT_STAT_FAST_DATA_INCR(session, compress_read);
+ WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
+ WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_bt_write --
+ * Write a buffer into a block, returning the block's addr/size and
+ * checksum.
+ */
+int
+__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
+ uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_ITEM *ip;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ size_t len, src_len, dst_len, result_len, size;
+ int data_cksum, compression_failed;
+ uint8_t *src, *dst;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /* Checkpoint calls are different than standard calls. */
+ WT_ASSERT(session,
+ (checkpoint == 0 && addr != NULL && addr_sizep != NULL) ||
+ (checkpoint == 1 && addr == NULL && addr_sizep == NULL));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * We're passed a table's disk image. Decompress if necessary and
+ * verify the image. Always check the in-memory length for accuracy.
+ */
+ dsk = buf->mem;
+ WT_ASSERT(session, dsk->u.entries != 0);
+ if (compressed) {
+ WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));
+
+ memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
+ WT_ERR(btree->compressor->decompress(
+ btree->compressor, &session->iface,
+ (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
+ buf->size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+ tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
+ &result_len));
+ WT_ASSERT(session,
+ dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
+ tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
+ ip = tmp;
+ } else {
+ WT_ASSERT(session, dsk->mem_size == buf->size);
+ ip = buf;
+ }
+ WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
+ __wt_scr_free(&tmp);
+#endif
+
+ /*
+ * Optionally stream-compress the data, but don't compress blocks that
+ * are already as small as they're going to get.
+ */
+ if (btree->compressor == NULL ||
+ btree->compressor->compress == NULL || compressed)
+ ip = buf;
+ else if (buf->size <= btree->allocsize) {
+ ip = buf;
+ WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
+ } else {
+ /* Skip the header bytes of the source data. */
+ src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
+ src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;
+
+ /*
+ * Compute the size needed for the destination buffer. By default we
+ * only allocate enough memory for a copy of the original: if any
+ * compressed version is bigger than the original, we won't use it.
+ * However, some compression engines (snappy is one example) may need
+ * more memory, because they don't stop just because there's no more
+ * memory into which to compress.
+ */
+ if (btree->compressor->pre_size == NULL)
+ len = src_len;
+ else
+ WT_ERR(btree->compressor->pre_size(btree->compressor,
+ &session->iface, src, src_len, &len));
+
+ size = len + WT_BLOCK_COMPRESS_SKIP;
+ WT_ERR(bm->write_size(bm, session, &size));
+ WT_ERR(__wt_scr_alloc(session, size, &tmp));
+
+ /* Skip the header bytes of the destination data. */
+ dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
+ dst_len = len;
+
+ compression_failed = 0;
+ WT_ERR(btree->compressor->compress(btree->compressor,
+ &session->iface,
+ src, src_len,
+ dst, dst_len,
+ &result_len, &compression_failed));
+ result_len += WT_BLOCK_COMPRESS_SKIP;
+
+ /*
+ * If compression fails, or doesn't gain us at least one unit of
+ * allocation, fall back to the original version. This isn't
+ * unexpected: if compression doesn't work for some chunk of data for
+ * some reason (for example, because of the additional format/header
+ * information compressed output requires), it just means the
+ * uncompressed version is as good as it gets, and that's what we use.
+ */
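+ /*
+ * Illustration (made-up sizes): with a 4KB allocation unit,
+ * compressing a 10000-byte image to 8500 bytes still occupies two
+ * allocation units, so the uncompressed image is written.
+ */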
+ if (compression_failed ||
+ buf->size / btree->allocsize ==
+ result_len / btree->allocsize) {
+ ip = buf;
+ WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
+ } else {
+ compressed = 1;
+ WT_STAT_FAST_DATA_INCR(session, compress_write);
+
+ /*
+ * Copy in the skipped header bytes, set the final data
+ * size.
+ */
+ memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
+ tmp->size = result_len;
+ ip = tmp;
+ }
+ }
+ dsk = ip->mem;
+
+ /* If the buffer is compressed, set the flag. */
+ if (compressed)
+ F_SET(dsk, WT_PAGE_COMPRESSED);
+
+ /*
+ * We increment the block's write generation so it's easy to identify
+ * newer versions of blocks during salvage. (It's common in WiredTiger,
+ * at least for the default block manager, for multiple blocks to be
+ * internally consistent with identical first and last keys, so we need
+ * a way to know the most recent state of the block. We could check
+ * which leaf is referenced by a valid internal page, but that implies
+ * salvaging internal pages, which I don't want to do, and it's not
+ * as good anyway, because the internal page may not have been written
+ * after the leaf page was updated. So, write generations it is.)
+ *
+ * Nothing is locked at this point, but two versions of a page with the
+ * same generation are pretty unlikely, and if that happens, they're
+ * going to be roughly identical for the purposes of salvage, anyway.
+ */
+ dsk->write_gen = ++btree->write_gen;
+
+ /*
+ * Checksum the data if the buffer isn't compressed or checksums are
+ * configured.
+ */
+ switch (btree->checksum) {
+ case CKSUM_ON:
+ data_cksum = 1;
+ break;
+ case CKSUM_OFF:
+ data_cksum = 0;
+ break;
+ case CKSUM_UNCOMPRESSED:
+ default:
+ data_cksum = !compressed;
+ break;
+ }
+
+ /* Call the block manager to write the block. */
+ WT_ERR(checkpoint ?
+ bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
+ bm->write(bm, session, ip, addr, addr_sizep, data_cksum));
+
+ WT_STAT_FAST_CONN_INCR(session, cache_write);
+ WT_STAT_FAST_DATA_INCR(session, cache_write);
+ WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, ip->size);
+ WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, ip->size);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_misc.c b/src/third_party/wiredtiger/src/btree/bt_misc.c
new file mode 100644
index 00000000000..cba1c0c61aa
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_misc.c
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_type_string --
+ * Return a string representing the page type.
+ */
+const char *
+__wt_page_type_string(u_int type)
+{
+ switch (type) {
+ case WT_PAGE_INVALID:
+ return ("invalid");
+ case WT_PAGE_BLOCK_MANAGER:
+ return ("block manager");
+ case WT_PAGE_COL_FIX:
+ return ("column-store fixed-length leaf");
+ case WT_PAGE_COL_INT:
+ return ("column-store internal");
+ case WT_PAGE_COL_VAR:
+ return ("column-store variable-length leaf");
+ case WT_PAGE_OVFL:
+ return ("overflow");
+ case WT_PAGE_ROW_INT:
+ return ("row-store internal");
+ case WT_PAGE_ROW_LEAF:
+ return ("row-store leaf");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
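+
+/*
+ * For example, the verbose read-tracing code later in this change prints the
+ * type alongside the page address:
+ *
+ *	__wt_verbose(session, WT_VERB_READ,
+ *	    "page %p: %s", page, __wt_page_type_string(page->type));
+ */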
+
+/*
+ * __wt_cell_type_string --
+ * Return a string representing the cell type.
+ */
+const char *
+__wt_cell_type_string(uint8_t type)
+{
+ switch (type) {
+ case WT_CELL_ADDR_DEL:
+ return ("addr/del");
+ case WT_CELL_ADDR_INT:
+ return ("addr/int");
+ case WT_CELL_ADDR_LEAF:
+ return ("addr/leaf");
+ case WT_CELL_ADDR_LEAF_NO:
+ return ("addr/leaf-no");
+ case WT_CELL_DEL:
+ return ("deleted");
+ case WT_CELL_KEY:
+ return ("key");
+ case WT_CELL_KEY_PFX:
+ return ("key/pfx");
+ case WT_CELL_KEY_OVFL:
+ return ("key/ovfl");
+ case WT_CELL_KEY_SHORT:
+ return ("key/short");
+ case WT_CELL_KEY_SHORT_PFX:
+ return ("key/short,pfx");
+ case WT_CELL_KEY_OVFL_RM:
+ return ("key/ovfl,rm");
+ case WT_CELL_VALUE:
+ return ("value");
+ case WT_CELL_VALUE_COPY:
+ return ("value/copy");
+ case WT_CELL_VALUE_OVFL:
+ return ("value/ovfl");
+ case WT_CELL_VALUE_OVFL_RM:
+ return ("value/ovfl,rm");
+ case WT_CELL_VALUE_SHORT:
+ return ("value/short");
+ default:
+ return ("unknown");
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_page_addr_string --
+ * Figure out a page's "address" and load a buffer with a printable,
+ * nul-terminated representation of that address.
+ */
+const char *
+__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
+{
+ size_t addr_size;
+ const uint8_t *addr;
+
+ if (__wt_ref_is_root(ref)) {
+ buf->data = "[Root]";
+ buf->size = strlen("[Root]");
+ return (buf->data);
+ }
+
+ (void)__wt_ref_info(session, ref, &addr, &addr_size, NULL);
+ return (__wt_addr_string(session, addr, addr_size, buf));
+}
+
+/*
+ * __wt_addr_string --
+ * Load a buffer with a printable, nul-terminated representation of an
+ * address.
+ */
+const char *
+__wt_addr_string(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, WT_ITEM *buf)
+{
+ WT_BM *bm;
+
+ bm = S2BT(session)->bm;
+
+ if (addr == NULL) {
+ buf->data = "[NoAddr]";
+ buf->size = strlen("[NoAddr]");
+ } else if (bm->addr_string(bm, session, buf, addr, addr_size) != 0) {
+ buf->data = "[Error]";
+ buf->size = strlen("[Error]");
+ }
+ return (buf->data);
+}
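+
+/*
+ * For example, the salvage code later in this change builds its diagnostic
+ * messages with these helpers:
+ *
+ *	__wt_verbose(session, WT_VERB_SALVAGE,
+ *	    "%s records %" PRIu64 "-%" PRIu64,
+ *	    __wt_addr_string(
+ *	    session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ *	    trk->col_start, trk->col_stop);
+ */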
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
new file mode 100644
index 00000000000..4cd317f1e8f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __ovfl_read --
+ * Read an overflow item from the disk.
+ */
+static int
+__ovfl_read(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, WT_ITEM *store)
+{
+ WT_BTREE *btree;
+ const WT_PAGE_HEADER *dsk;
+
+ btree = S2BT(session);
+
+ /*
+ * Read the overflow item from the block manager, then reference the
+ * start of the data and set the data's length.
+ *
+ * Overflow reads are synchronous. That may bite me at some point, but
+ * because WiredTiger supports large page sizes, overflow items should
+ * be rare.
+ */
+ WT_RET(__wt_bt_read(session, store, addr, addr_size));
+ dsk = store->data;
+ store->data = WT_PAGE_HEADER_BYTE(btree, dsk);
+ store->size = dsk->u.datalen;
+
+ WT_STAT_FAST_DATA_INCR(session, cache_read_overflow);
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_read --
+ * Bring an overflow item into memory.
+ */
+int
+__wt_ovfl_read(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_DECL_RET;
+
+ /*
+ * If no page is specified, there's no need to lock and no cache to
+ * search; we don't care about WT_CELL_VALUE_OVFL_RM cells.
+ */
+ if (page == NULL)
+ return (
+ __ovfl_read(session, unpack->data, unpack->size, store));
+
+ /*
+ * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow
+ * value, but there was still a reader in the system that might need it,
+ * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM
+ * and we will be passed a page so we can look-aside into the cache of
+ * such values.
+ *
+ * Acquire the overflow lock, and retest the on-page cell's value inside
+ * the lock.
+ */
+ WT_RET(__wt_readlock(session, S2BT(session)->ovfl_lock));
+ ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ?
+ __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) :
+ __ovfl_read(session, unpack->data, unpack->size, store);
+ WT_TRET(__wt_readunlock(session, S2BT(session)->ovfl_lock));
+
+ return (ret);
+}
+
+/*
+ * __ovfl_cache_col_visible --
+ * column-store: check for a globally visible update.
+ */
+static int
+__ovfl_cache_col_visible(
+ WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
+{
+ /*
+ * Column-store is harder than row-store: we're here because there's a
+ * reader in the system that might read the original version of an
+ * overflow record, which might match a number of records. For example,
+ * the original overflow value was for records 100-200, we've replaced
+ * each of those records individually, but there exists a reader that
+ * might read any one of those records, and all of those records have
+ * different update entries with different transaction IDs. Since it's
+ * infeasible to determine if there's a globally visible update for each
+ * reader for each record, we test the simple case where a single record
+ * has a single, globally visible update. If that's not the case, cache
+ * the value.
+ */
+ if (__wt_cell_rle(unpack) == 1 &&
+ upd != NULL && /* Sanity: upd should always be set. */
+ __wt_txn_visible_all(session, upd->txnid))
+ return (1);
+ return (0);
+}
+
+/*
+ * __ovfl_cache_row_visible --
+ * row-store: check for a globally visible update.
+ */
+static int
+__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
+{
+ WT_UPDATE *upd;
+
+ /* Check to see if there's a globally visible update. */
+ for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next)
+ if (__wt_txn_visible_all(session, upd->txnid))
+ return (1);
+
+ return (0);
+}
+
+/*
+ * __ovfl_cache --
+ * Cache a deleted overflow value.
+ */
+static int
+__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ addr = unpack->data;
+ addr_size = unpack->size;
+
+ WT_RET(__wt_scr_alloc(session, 1024, &tmp));
+
+ /* Enter the value into the overflow cache. */
+ WT_ERR(__ovfl_read(session, addr, addr_size, tmp));
+ WT_ERR(__wt_ovfl_txnc_add(
+ session, page, addr, addr_size, tmp->data, tmp->size));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_ovfl_cache --
+ * Handle deletion of an overflow value.
+ */
+int
+__wt_ovfl_cache(WT_SESSION_IMPL *session,
+ WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
+{
+ int visible;
+
+ /*
+ * This function solves a problem in reconciliation. The scenario is:
+ * - reconciling a leaf page that references an overflow item
+ * - the item is updated and the update committed
+ * - a checkpoint runs, freeing the backing overflow blocks
+ * - a snapshot transaction wants the original version of the item
+ *
+ * In summary, we may need the original version of an overflow item for
+ * a snapshot transaction after the item was deleted from a page that's
+ * subsequently been checkpointed, where the checkpoint must know about
+ * the freed blocks. We don't have any way to delay a free of the
+ * underlying blocks until a particular set of transactions exit (and
+ * this shouldn't be a common scenario), so cache the overflow value in
+ * memory.
+ *
+ * This gets hard because the snapshot transaction reader might:
+ * - search the WT_UPDATE list and not find a useful entry
+ * - read the overflow value's address from the on-page cell
+ * - go to sleep
+ * - checkpoint runs, caches the overflow value, frees the blocks
+ * - another thread allocates and overwrites the blocks
+ * - the reader wakes up and reads the wrong value
+ *
+ * Use a read/write lock and the on-page cell to fix the problem: hold
+ * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to
+ * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow
+ * item.
+ *
+ * The read/write lock is per btree, but it could be per page or even
+ * per overflow item. We don't do any of that because overflow values
+ * are supposed to be rare and we shouldn't see contention for the lock.
+ *
+ * Check for a globally visible update. If there is a globally visible
+ * update, we don't need to cache the item because it's not possible for
+ * a running thread to have moved past it.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_VAR:
+ visible = __ovfl_cache_col_visible(session, cookie, vpack);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ visible = __ovfl_cache_row_visible(session, page, cookie);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * If there's no globally visible update, there's a reader in the system
+ * that might try and read the old value, cache it.
+ */
+ if (!visible) {
+ WT_RET(__ovfl_cache(session, page, vpack));
+ WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
+ }
+
+ /*
+ * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
+ * underlying overflow value's blocks to be freed when reconciliation
+ * completes.
+ */
+ return (__wt_ovfl_discard_add(session, page, vpack->cell));
+}
+
+/*
+ * __wt_ovfl_discard --
+ * Discard an on-page overflow value, and reset the page's cell.
+ */
+int
+__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+
+ __wt_cell_unpack(cell, unpack);
+
+ /*
+ * Remove overflow key/value objects; this is called when
+ * reconciliation finishes after successfully writing a page.
+ *
+ * Keys must have already been instantiated and value objects must have
+ * already been cached (if they might potentially still be read by any
+ * running transaction).
+ *
+ * Acquire the overflow lock to avoid racing with a thread reading the
+ * backing overflow blocks.
+ */
+ WT_RET(__wt_writelock(session, btree->ovfl_lock));
+
+ switch (unpack->raw) {
+ case WT_CELL_KEY_OVFL:
+ __wt_cell_type_reset(session,
+ unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
+ break;
+ case WT_CELL_VALUE_OVFL:
+ __wt_cell_type_reset(session,
+ unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));
+
+ /* Free the backing disk blocks. */
+ WT_TRET(bm->free(bm, session, unpack->data, unpack->size));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
new file mode 100644
index 00000000000..c5f24c06286
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -0,0 +1,734 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
+static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
+static int __inmem_row_leaf_entries(
+ WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *);
+
+/*
+ * __evict_force_check --
+ * Check if a page matches the criteria for forced eviction.
+ */
+static int
+__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Pages are usually small enough, check that first. */
+ if (page->memory_footprint < btree->maxmempage)
+ return (0);
+
+ /* Leaf pages only. */
+ if (page->type != WT_PAGE_COL_FIX &&
+ page->type != WT_PAGE_COL_VAR &&
+ page->type != WT_PAGE_ROW_LEAF)
+ return (0);
+
+ /* Eviction may be turned off, although that's rare. */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (0);
+
+ /*
+ * It's hard to imagine a page with a huge memory footprint that has
+ * never been modified, but check to be sure.
+ */
+ if (page->modify == NULL)
+ return (0);
+
+ /* Trigger eviction on the next page release. */
+ page->read_gen = WT_READGEN_OLDEST;
+
+ return (1);
+}
+
+/*
+ * __wt_page_in_func --
+ * Acquire a hazard pointer to a page; if the page is not in-memory,
+ * read it from the disk and build an in-memory version.
+ */
+int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ int busy, force_attempts, oldgen;
+
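+ /*
+ * Loop until we get a hazard pointer on an in-memory version of the
+ * page or give up; each pass through the loop handles one page state.
+ */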
+ for (force_attempts = oldgen = 0;;) {
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+
+ /*
+ * The page isn't in memory, attempt to read it.
+ * Make sure there is space in the cache.
+ */
+ WT_RET(__wt_cache_full_check(session));
+ WT_RET(__wt_cache_read(session, ref));
+ oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
+ F_ISSET(session, WT_SESSION_NO_CACHE);
+ continue;
+ case WT_REF_READING:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+ /* FALLTHROUGH */
+ case WT_REF_LOCKED:
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+ /* The page is busy -- wait. */
+ break;
+ case WT_REF_SPLIT:
+ return (WT_RESTART);
+ case WT_REF_MEM:
+ /*
+ * The page is in memory: get a hazard pointer, update
+ * the page's LRU and return. The expected reason we
+ * can't get a hazard pointer is because the page is
+ * being evicted; yield and try again.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(
+ __wt_hazard_set(session, ref, &busy, file, line));
+#else
+ WT_RET(__wt_hazard_set(session, ref, &busy));
+#endif
+ if (busy)
+ break;
+
+ page = ref->page;
+ WT_ASSERT(session, page != NULL);
+
+ /* Forcibly evict pages that are too big. */
+ if (!LF_ISSET(WT_READ_NO_EVICT) &&
+ force_attempts < 10 &&
+ __evict_force_check(session, page)) {
+ ++force_attempts;
+ WT_RET(__wt_page_release(session, ref, flags));
+ break;
+ }
+
+ /* Check if we need an autocommit transaction. */
+ if ((ret = __wt_txn_autocommit_check(session)) != 0) {
+ WT_TRET(__wt_hazard_clear(session, page));
+ return (ret);
+ }
+
+ /*
+ * If we read the page and we are configured to not
+ * trash the cache, set the oldest read generation so
+ * the page is forcibly evicted as soon as possible.
+ *
+ * Otherwise, update the page's read generation.
+ */
+ if (oldgen && page->read_gen == WT_READGEN_NOTSET)
+ page->read_gen = WT_READGEN_OLDEST;
+ else if (!LF_ISSET(WT_READ_NO_GEN) &&
+ page->read_gen < __wt_cache_read_gen(session))
+ page->read_gen =
+ __wt_cache_read_gen_set(session);
+
+ return (0);
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* We failed to get the page -- yield before retrying. */
+ __wt_yield();
+ }
+}
+
+/*
+ * __wt_page_alloc --
+ * Create or read a page into the cache.
+ */
+int
+__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
+ uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep)
+{
+ WT_CACHE *cache;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ size_t size;
+ uint32_t i;
+ void *p;
+
+ *pagep = NULL;
+
+ cache = S2C(session)->cache;
+ page = NULL;
+
+ size = sizeof(WT_PAGE);
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Variable-length column-store leaf page: allocate memory to
+ * describe the page's contents with the initial allocation.
+ */
+ size += alloc_entries * sizeof(WT_COL);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Row-store leaf page: allocate memory to describe the page's
+ * contents with the initial allocation.
+ */
+ size += alloc_entries * sizeof(WT_ROW);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
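+
+ /*
+ * Note the leaf cases add the entry array to the same allocation:
+ * the WT_COL/WT_ROW array is laid out immediately after the WT_PAGE
+ * structure, which is what the pointer arithmetic below relies on.
+ */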
+
+ WT_RET(__wt_calloc(session, 1, size, &page));
+
+ page->type = type;
+ page->read_gen = WT_READGEN_NOTSET;
+
+ switch (type) {
+ case WT_PAGE_COL_FIX:
+ page->pg_fix_recno = recno;
+ page->pg_fix_entries = alloc_entries;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ page->pg_intl_recno = recno;
+
+ /*
+ * Internal pages have an array of references to objects so they
+ * can split. Allocate the array of references and, optionally,
+ * the objects to which they point.
+ */
+ WT_ERR(__wt_calloc(session, 1,
+ sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *),
+ &p));
+ size +=
+ sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *);
+ pindex = p;
+ pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1);
+ pindex->entries = alloc_entries;
+ WT_INTL_INDEX_SET(page, pindex);
+ if (alloc_refs)
+ for (i = 0; i < pindex->entries; ++i) {
+ WT_ERR(__wt_calloc_def(
+ session, 1, &pindex->index[i]));
+ size += sizeof(WT_REF);
+ }
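+ /*
+ * Error unwinding uses the common WiredTiger "if (0) { err: ... }"
+ * idiom: the block below is only reachable through the err label.
+ */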
+ if (0) {
+err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
+ for (i = 0; i < pindex->entries; ++i)
+ __wt_free(session, pindex->index[i]);
+ __wt_free(session, pindex);
+ }
+ __wt_free(session, page);
+ return (ret);
+ }
+ break;
+ case WT_PAGE_COL_VAR:
+ page->pg_var_recno = recno;
+ page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
+ page->pg_var_entries = alloc_entries;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
+ page->pg_row_entries = alloc_entries;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Increment the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+ (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);
+
+ *pagep = page;
+ return (0);
+}
+
+/*
+ * __wt_page_inmem --
+ * Build in-memory page information.
+ */
+int
+__wt_page_inmem(WT_SESSION_IMPL *session,
+ WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ const WT_PAGE_HEADER *dsk;
+ uint32_t alloc_entries;
+ size_t size;
+
+ *pagep = NULL;
+
+ dsk = image;
+ alloc_entries = 0;
+
+ /*
+ * Figure out how many underlying objects the page references so we can
+ * allocate them along with the page.
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store leaf page entries map one-to-one to the number
+ * of physical entries on the page (each physical entry is a
+ * value item).
+ *
+ * Column-store internal page entries map one-to-one to the
+ * number of physical entries on the page (each entry is a
+ * location cookie).
+ */
+ alloc_entries = dsk->u.entries;
+ break;
+ case WT_PAGE_ROW_INT:
+ /*
+ * Row-store internal page entries map one-to-two to the number
+ * of physical entries on the page (each entry is a key and
+ * location cookie pair).
+ */
+ alloc_entries = dsk->u.entries / 2;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * If the "all values empty" flag is set, each physical entry
+ * is a key, so in-memory entries map one-to-one to physical
+ * entries. If the "no empty values" flag is set, physical
+ * entries are key/value pairs, so there are half as many
+ * in-memory entries. If neither flag is set, some keys have
+ * values and some don't: we have to walk the page to figure
+ * it out.
+ */
+ if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
+ alloc_entries = dsk->u.entries;
+ else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
+ alloc_entries = dsk->u.entries / 2;
+ else
+ WT_RET(__inmem_row_leaf_entries(
+ session, dsk, &alloc_entries));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Allocate and initialize a new WT_PAGE. */
+ WT_RET(__wt_page_alloc(
+ session, dsk->type, dsk->recno, alloc_entries, 1, &page));
+ page->dsk = dsk;
+ F_SET_ATOMIC(page, flags);
+
+ /*
+ * Track the memory allocated to build this page so we can update the
+ * cache statistics in a single call.
+ */
+ size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ __inmem_col_fix(session, page);
+ break;
+ case WT_PAGE_COL_INT:
+ __inmem_col_int(session, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__inmem_col_var(session, page, &size));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__inmem_row_int(session, page, &size));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__inmem_row_leaf(session, page));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Update the page's in-memory size and the cache statistics. */
+ __wt_cache_page_inmem_incr(session, page, size);
+
+ /* Link the new internal page to the parent. */
+ if (ref != NULL) {
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ page->pg_intl_parent_ref = ref;
+ break;
+ }
+ ref->page = page;
+ }
+
+ *pagep = page;
+ return (0);
+
+err: __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __inmem_col_fix --
+ * Build in-memory index for fixed-length column-store leaf pages.
+ */
+static void
+__inmem_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ const WT_PAGE_HEADER *dsk;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+
+ page->pg_fix_bitf = WT_PAGE_HEADER_BYTE(btree, dsk);
+}
+
+/*
+ * __inmem_col_int --
+ * Build in-memory index for column-store internal pages.
+ */
+static void
+__inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ WT_PAGE_INDEX *pindex;
+ WT_REF **refp, *ref;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /*
+ * Walk the page, building references: the page contains address
+ * cells (location cookies), and each cell carries the starting
+ * record number of the subtree it references.
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+ refp = pindex->index;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ref = *refp++;
+ ref->home = page;
+
+ __wt_cell_unpack(cell, unpack);
+ ref->addr = cell;
+ ref->key.recno = unpack->v;
+ }
+}
+
+/*
+ * __inmem_col_var_repeats --
+ * Count the number of repeat entries on the page.
+ */
+static int
+__inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /* Walk the page, counting entries for the repeats array. */
+ *np = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (__wt_cell_rle(unpack) > 1)
+ ++*np;
+ }
+ return (0);
+}
+
+/*
+ * __inmem_col_var --
+ * Build in-memory index for variable-length, data-only leaf pages in
+ * column-store trees.
+ */
+static int
+__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+{
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_COL_RLE *repeats;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ uint64_t recno, rle;
+ size_t bytes_allocated;
+ uint32_t i, indx, n, repeat_off;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ recno = page->pg_var_recno;
+
+ repeats = NULL;
+ repeat_off = 0;
+ unpack = &_unpack;
+ bytes_allocated = 0;
+
+ /*
+ * Walk the page, building references: the page contains unsorted value
+ * items. The value items are on-page (WT_CELL_VALUE), overflow items
+ * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL).
+ */
+ indx = 0;
+ cip = page->pg_var_d;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, cell));
+ cip++;
+
+ /*
+ * Add records with repeat counts greater than 1 to an array we
+ * use for fast lookups. The first entry we find needing the
+ * repeats array triggers a re-walk from the start of the page
+ * to determine the size of the array.
+ */
+ rle = __wt_cell_rle(unpack);
+ if (rle > 1) {
+ if (repeats == NULL) {
+ WT_RET(
+ __inmem_col_var_repeats(session, page, &n));
+ WT_RET(__wt_realloc_def(session,
+ &bytes_allocated, n + 1, &repeats));
+
+ page->pg_var_repeats = repeats;
+ page->pg_var_nrepeats = n;
+ *sizep += bytes_allocated;
+ }
+ repeats[repeat_off].indx = indx;
+ repeats[repeat_off].recno = recno;
+ repeats[repeat_off++].rle = rle;
+ }
+ indx++;
+ recno += rle;
+ }
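+
+ /*
+ * For example (illustrative values only): on a page starting at
+ * record 100 whose first two cells are single records and whose
+ * third cell has an RLE count of 5, the repeats entry would be
+ * { indx 2, recno 102, rle 5 }.
+ */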
+
+ return (0);
+}
+
+/*
+ * __inmem_row_int --
+ * Build in-memory index for row-store internal pages.
+ */
+static int
+__inmem_row_int(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(current);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref, **refp;
+ uint32_t i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ dsk = page->dsk;
+
+ WT_RET(__wt_scr_alloc(session, 0, &current));
+
+ /*
+ * Walk the page, instantiating keys: the page contains sorted key and
+ * location cookie pairs. Keys are on-page/overflow items and location
+ * cookies are WT_CELL_ADDR_XXX items.
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+ refp = pindex->index;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ref = *refp;
+ ref->home = page;
+
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ /*
+ * Note: we don't Huffman encode internal page keys, so
+ * there's no decoding work to do.
+ */
+ __wt_ref_key_onpage_set(page, ref, unpack);
+ break;
+ case WT_CELL_KEY_OVFL:
+ /* Instantiate any overflow records. */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, page->type, unpack, current));
+
+ WT_ERR(__wt_row_ikey_incr(session, page,
+ WT_PAGE_DISK_OFFSET(page, cell),
+ current->data, current->size, &ref->key.ikey));
+
+ *sizep += sizeof(WT_IKEY) + current->size;
+ break;
+ case WT_CELL_ADDR_DEL:
+ /*
+ * A cell may reference a deleted leaf page: if a leaf
+ * page was deleted without being read (fast truncate),
+ * and the deletion committed, but older transactions
+ * in the system required the previous version of the
+ * page to remain available, a special deleted-address
+ * type cell is written. The only reason we'd ever see
+ * that cell on a page we're reading is if we crashed
+ * and recovered (otherwise a version of the page w/o
+ * that cell would have eventually been written). If we
+ * crash and recover to a page with a deleted-address
+ * cell, we want to discard the page from the backing
+ * store (it was never discarded), and, of course, by
+ * definition no earlier transaction will ever need it.
+ *
+ * Re-create the state of a deleted page.
+ */
+ ref->addr = cell;
+ ref->state = WT_REF_DELETED;
+ ++refp;
+
+ /*
+ * If the tree is already dirty and so will be written,
+ * mark the page dirty. (We want to free the deleted
+ * pages, but if the handle is read-only or if the
+ * application never modifies the tree, we're not able
+ * to do so.)
+ */
+ if (btree->modified) {
+ WT_ERR(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+ }
+ break;
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ ref->addr = cell;
+ ++refp;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+err: __wt_scr_free(&current);
+ return (ret);
+}
+
+/*
+ * __inmem_row_leaf_entries --
+ * Return the number of entries for row-store leaf pages.
+ */
+static int
+__inmem_row_leaf_entries(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, uint32_t *nindxp)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i, nindx;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+
+ /*
+ * Leaf row-store page entries map at most one-to-one to the number
+ * of physical entries on the page (each physical entry might be
+ * a key without a subsequent data item). To avoid over-allocation in
+ * workloads without empty data items, first walk the page counting the
+ * number of keys, then allocate the indices.
+ *
+ * The page contains key/data pairs. Keys are on-page (WT_CELL_KEY) or
+ * overflow (WT_CELL_KEY_OVFL) items, data are either non-existent or a
+ * single on-page (WT_CELL_VALUE) or overflow (WT_CELL_VALUE_OVFL) item.
+ */
+ nindx = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ ++nindx;
+ break;
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ }
+
+ *nindxp = nindx;
+ return (0);
+}
+
+/*
+ * __inmem_row_leaf --
+ * Build in-memory index for row-store leaf pages.
+ */
+static int
+__inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ const WT_PAGE_HEADER *dsk;
+ WT_ROW *rip;
+ uint32_t i;
+
+ btree = S2BT(session);
+ dsk = page->dsk;
+ unpack = &_unpack;
+
+ /* Walk the page, building indices. */
+ rip = page->pg_row_d;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY_OVFL:
+ __wt_row_leaf_key_set_cell(page, rip, cell);
+ ++rip;
+ break;
+ case WT_CELL_KEY:
+ /*
+ * Simple keys without compression (not Huffman encoded
+ * or prefix compressed) can be directly referenced on
+ * the page to avoid repeatedly unpacking their cells.
+ */
+ if (!btree->huffman_key && unpack->prefix == 0)
+ __wt_row_leaf_key_set(page, rip, unpack);
+ else
+ __wt_row_leaf_key_set_cell(page, rip, cell);
+ ++rip;
+ break;
+ case WT_CELL_VALUE:
+ /*
+ * Simple values without compression can be directly
+ * referenced on the page to avoid repeatedly unpacking
+ * their cells.
+ */
+ if (!btree->huffman_value)
+ __wt_row_leaf_value_set(page, rip - 1, unpack);
+ break;
+ case WT_CELL_VALUE_OVFL:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ }
+
+ /*
+ * We do not currently instantiate keys on leaf pages when the page is
+ * loaded; they're instantiated on demand.
+ */
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
new file mode 100644
index 00000000000..9cd6f8310af
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_read --
+ * Read a page from the file.
+ */
+int
+__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_ITEM tmp;
+ WT_PAGE *page;
+ WT_PAGE_STATE previous_state;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ page = NULL;
+
+ /*
+ * Don't pass an allocated buffer to the underlying block read function,
+ * force allocation of new memory of the appropriate size.
+ */
+ WT_CLEAR(tmp);
+
+ /*
+ * Attempt to set the state to WT_REF_READING for normal reads, or
+ * to WT_REF_LOCKED for deleted pages. If successful, we've won the
+ * race: read the page.
+ */
+ if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
+ previous_state = WT_REF_DISK;
+ else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ previous_state = WT_REF_DELETED;
+ else
+ /* Lost the race: the page is already in memory or being read. */
+ return (0);
+
+ /*
+ * Get the address: if there is no address, the page was deleted, but a
+ * subsequent search or insert is forcing re-creation of the name space.
+ * Otherwise, there's an address, read the backing disk page and build
+ * an in-memory version of the page.
+ */
+ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ if (addr == NULL) {
+ WT_ASSERT(session, previous_state == WT_REF_DELETED);
+
+ WT_ERR(__wt_btree_new_leaf_page(session, &page));
+ ref->page = page;
+ } else {
+ /* Read the backing disk page. */
+ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
+
+ /* Build the in-memory version of the page. */
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data,
+ WT_DATA_IN_ITEM(&tmp) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /* If the page was deleted, instantiate that information. */
+ if (previous_state == WT_REF_DELETED)
+ WT_ERR(__wt_delete_page_instantiate(session, ref));
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_READ,
+ "page %p: %s", page, __wt_page_type_string(page->type)));
+
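+ /*
+ * The page must be fully built before it's made visible to other
+ * threads; WT_PUBLISH includes the write barrier that enforces
+ * this ordering.
+ */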
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+ return (0);
+
+err: /*
+ * If the function building an in-memory version of the page failed,
+ * it discarded the page, but not the disk image. Discard the page
+ * and separately discard the disk image in all cases.
+ */
+ if (ref->page != NULL)
+ __wt_ref_out(session, ref);
+ WT_PUBLISH(ref->state, previous_state);
+
+ __wt_buf_free(session, &tmp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
new file mode 100644
index 00000000000..25b4bfc3005
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -0,0 +1,116 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_kv_return --
+ * Return a page referenced key/value pair to the application.
+ */
+int
+__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK unpack;
+ WT_CURSOR *cursor;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ uint8_t v;
+
+ btree = S2BT(session);
+
+ page = cbt->ref->page;
+ cursor = &cbt->iface;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * The interface cursor's record has usually been set, but that
+ * isn't universally true; specifically, cursor.search_near may
+ * call here without first setting the interface cursor.
+ */
+ cursor->recno = cbt->recno;
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Take the value from the original page. */
+ v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
+ return (__wt_buf_set(session, &cursor->value, &v, 1));
+ case WT_PAGE_COL_VAR:
+ /*
+ * The interface cursor's record has usually been set, but that
+ * isn't universally true; specifically, cursor.search_near may
+ * call here without first setting the interface cursor.
+ */
+ cursor->recno = cbt->recno;
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Take the value from the original page cell. */
+ cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ rip = &page->pg_row_d[cbt->slot];
+
+ /*
+ * If the cursor references a WT_INSERT item, take its key.
+ * Else, if we have an exact match, we copied the key in the
+ * search function, take it from there.
+ * If we don't have an exact match, take the key from the
+ * original page.
+ */
+ if (cbt->ins != NULL) {
+ cursor->key.data = WT_INSERT_KEY(cbt->ins);
+ cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
+ } else if (cbt->compare == 0) {
+ cursor->key.data = cbt->search_key.data;
+ cursor->key.size = cbt->search_key.size;
+ } else
+ WT_RET(__wt_row_leaf_key(
+ session, page, rip, &cursor->key, 0));
+
+ /* If the cursor references a WT_UPDATE item, return it. */
+ if (upd != NULL) {
+ cursor->value.data = WT_UPDATE_DATA(upd);
+ cursor->value.size = upd->size;
+ return (0);
+ }
+
+ /* Simple values have their location encoded in the WT_ROW. */
+ if (__wt_row_leaf_value(page, rip, &cursor->value))
+ return (0);
+
+ /*
+ * Take the value from the original page cell (which may be
+ * empty).
+ */
+ if ((cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
+ cursor->value.size = 0;
+ return (0);
+ }
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* The value is an on-page cell, unpack and expand it as necessary. */
+ __wt_cell_unpack(cell, &unpack);
+ WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c
new file mode 100644
index 00000000000..10366e91a0e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c
@@ -0,0 +1,2520 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+struct __wt_stuff; typedef struct __wt_stuff WT_STUFF;
+struct __wt_track; typedef struct __wt_track WT_TRACK;
+struct __wt_track_shared; typedef struct __wt_track_shared WT_TRACK_SHARED;
+
+/*
+ * There's a bunch of stuff we pass around during salvage, group it together
+ * to make the code prettier.
+ */
+struct __wt_stuff {
+ WT_SESSION_IMPL *session; /* Salvage session */
+
+ WT_TRACK **pages; /* Pages */
+ uint32_t pages_next; /* Next empty slot */
+ size_t pages_allocated; /* Bytes allocated */
+
+ WT_TRACK **ovfl; /* Overflow pages */
+ uint32_t ovfl_next; /* Next empty slot */
+ size_t ovfl_allocated; /* Bytes allocated */
+
+ WT_REF root_ref; /* Created root page */
+
+ uint8_t page_type; /* Page type */
+
+ /* Set if we must free blocks backing merged page ranges. */
+ int merge_free;
+
+ WT_ITEM *tmp1; /* Verbose print buffer */
+ WT_ITEM *tmp2; /* Verbose print buffer */
+
+ uint64_t fcnt; /* Progress counter */
+};
+
+/*
+ * WT_TRACK_SHARED --
+ * Information shared between pages being merged.
+ */
+struct __wt_track_shared {
+ uint32_t ref; /* Reference count */
+
+ /*
+ * Physical information about the file block.
+ */
+ WT_ADDR addr; /* Page address */
+ uint32_t size; /* Page size */
+ uint64_t gen; /* Page generation */
+
+ /*
+ * Pages that reference overflow pages contain a list of the overflow
+ * pages they reference. We start out with a list of addresses, and
+ * convert to overflow array slots during the reconciliation of page
+ * references to overflow records.
+ */
+ WT_ADDR *ovfl_addr; /* Overflow pages by address */
+ uint32_t *ovfl_slot; /* Overflow pages by slot */
+ uint32_t ovfl_cnt; /* Overflow reference count */
+};
+
+/*
+ * WT_TRACK --
+ * Structure to track chunks, one per chunk; we start out with a chunk per
+ * page (either leaf or overflow), but when we find overlapping key ranges, we
+ * split the leaf page chunks up, one chunk for each unique key range.
+ */
+struct __wt_track {
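+/*
+ * The trk_XXX macros indirect through the WT_TRACK_SHARED structure, so
+ * chunks split from the same physical page share one copy of the block
+ * information.
+ */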
+#define trk_addr shared->addr.addr
+#define trk_addr_size shared->addr.size
+#define trk_gen shared->gen
+#define trk_ovfl_addr shared->ovfl_addr
+#define trk_ovfl_cnt shared->ovfl_cnt
+#define trk_ovfl_slot shared->ovfl_slot
+#define trk_size shared->size
+ WT_TRACK_SHARED *shared; /* Shared information */
+
+ WT_STUFF *ss; /* Enclosing stuff */
+
+ union {
+ struct {
+#undef row_start
+#define row_start u.row._row_start
+ WT_ITEM _row_start; /* Row-store start range */
+#undef row_stop
+#define row_stop u.row._row_stop
+ WT_ITEM _row_stop; /* Row-store stop range */
+ } row;
+
+ struct {
+#undef col_start
+#define col_start u.col._col_start
+ uint64_t _col_start; /* Col-store start range */
+#undef col_stop
+#define col_stop u.col._col_stop
+ uint64_t _col_stop; /* Col-store stop range */
+#undef col_missing
+#define col_missing u.col._col_missing
+ uint64_t _col_missing; /* Col-store missing range */
+ } col;
+ } u;
+
+#define WT_TRACK_CHECK_START 0x01 /* Row: initial key updated */
+#define WT_TRACK_CHECK_STOP 0x02 /* Row: last key updated */
+#define WT_TRACK_MERGE 0x04 /* Page requires merging */
+#define WT_TRACK_OVFL_REFD 0x08 /* Overflow page referenced */
+ u_int flags;
+};
+
+static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
+static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *);
+static int __slvg_col_ovfl(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t);
+static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_col_range_overlap(
+ WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
+static void __slvg_col_trk_update_start(uint32_t, WT_STUFF *);
+static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_compare(const void *, const void *);
+static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, int);
+static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *);
+static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
+static int __slvg_row_build_leaf(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *);
+static int __slvg_row_ovfl(
+ WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t);
+static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *);
+static int __slvg_row_range_overlap(
+ WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
+static int __slvg_row_trk_update_start(
+ WT_SESSION_IMPL *, WT_ITEM *, uint32_t, WT_STUFF *);
+static int __slvg_trk_compare_addr(const void *, const void *);
+static int __slvg_trk_compare_gen(const void *, const void *);
+static int __slvg_trk_compare_key(const void *, const void *);
+static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, int);
+static void __slvg_trk_free_addr(WT_SESSION_IMPL *, WT_TRACK *);
+static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *,
+ size_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **);
+static int __slvg_trk_leaf(WT_SESSION_IMPL *,
+ const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
+static int __slvg_trk_leaf_ovfl(
+ WT_SESSION_IMPL *, const WT_PAGE_HEADER *, WT_TRACK *);
+static int __slvg_trk_ovfl(WT_SESSION_IMPL *,
+ const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
+static int __slvg_trk_split(WT_SESSION_IMPL *, WT_TRACK *, WT_TRACK **);
+
+/*
+ * __wt_bt_salvage --
+ * Salvage a Btree.
+ */
+int
+__wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_STUFF *ss, stuff;
+ uint32_t i, leaf_cnt;
+
+ WT_UNUSED(cfg);
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ WT_CLEAR(stuff);
+ ss = &stuff;
+ ss->session = session;
+ ss->page_type = WT_PAGE_INVALID;
+
+ /* Allocate temporary buffers. */
+ WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2));
+
+ /*
+ * Step 1:
+ * Inform the underlying block manager that we're salvaging the file.
+ */
+ WT_ERR(bm->salvage_start(bm, session));
+
+ /*
+ * Step 2:
+ * Read the file and build in-memory structures that reference any leaf
+ * or overflow page. Any pages other than leaf or overflow pages are
+ * added to the free list.
+ *
+ * Turn off read checksum and verification error messages while we're
+ * reading the file, we expect to see corrupted blocks.
+ */
+ F_SET(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ ret = __slvg_read(session, ss);
+ F_CLR(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ WT_ERR(ret);
+
+ /*
+ * Step 3:
+ * Discard any page referencing a non-existent overflow page. We do
+ * this before checking overlapping key ranges on the grounds that a
+ * bad key range we can use is better than a terrific key range that
+ * references pages we don't have. On the other hand, we subsequently
+ * discard key ranges where there are better overlapping ranges, and
+ * it would be better if we let the availability of an overflow value
+ * inform our choices as to the key ranges we select, ideally on a
+ * per-key basis.
+ *
+ * A complicating problem is found in variable-length column-store
+ * objects, where we potentially split key ranges within RLE units.
+ * For example, if there's a page with rows 15-20 and we later find
+ * row 17 with a larger LSN, the range splits into 3 chunks, 15-16,
+ * 17, and 18-20. If rows 15-20 were originally a single value (an
+ * RLE of 6), and that record is an overflow record, we end up with
+ * two chunks, both of which want to reference the same overflow value.
+ *
+ * Instead of the approach just described, we're first discarding any
+ * pages referencing non-existent overflow pages, then we're reviewing
+ * our key ranges and discarding any that overlap. We're doing it that
+ * way for a few reasons: absent corruption, missing overflow items are
+ * strong arguments the page was replaced (on the other hand, some kind
+ * of file corruption is probably why we're here); it's a significant
+ * amount of additional complexity to simultaneously juggle overlapping
+ * ranges and missing overflow items; finally, because WiredTiger
+ * supports very large page sizes, real-world applications usually
+ * don't have a lot of overflow items anyway.
+ *
+ * Step 4:
+ * Add unreferenced overflow page blocks to the free list so they are
+ * reused immediately.
+ */
+ if (ss->ovfl_next != 0) {
+ WT_ERR(__slvg_ovfl_reconcile(session, ss));
+ WT_ERR(__slvg_ovfl_discard(session, ss));
+ }
+
+ /*
+ * Step 5:
+ * Walk the list of pages looking for overlapping ranges to resolve.
+ * If we find a range that needs to be resolved, set a global flag
+ * and a per WT_TRACK flag on the pages requiring modification.
+ *
+ * This requires sorting the page list by key, and secondarily by LSN.
+ *
+ * !!!
+ * It's vanishingly unlikely and probably impossible for fixed-length
+ * column-store files to have overlapping key ranges. It's possible
+ * for an entire key range to go missing (if a page is corrupted and
+ * lost), but because pages can't split, it shouldn't be possible to
+ * find pages where the key ranges overlap. That said, we check for
+ * it and clean up after it in reconciliation because it doesn't cost
+ * much and future column-store formats or operations might allow for
+ * fixed-length format ranges to overlap during salvage, and I don't
+ * want to have to retrofit the code later.
+ */
+ qsort(ss->pages,
+ (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key);
+ if (ss->page_type == WT_PAGE_ROW_LEAF)
+ WT_ERR(__slvg_row_range(session, ss));
+ else
+ WT_ERR(__slvg_col_range(session, ss));
+
+ /*
+ * Step 6:
+ * We may have lost key ranges in column-store databases, that is, some
+ * part of the record number space is gone. Look for missing ranges.
+ */
+ switch (ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__slvg_col_range_missing(session, ss));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ break;
+ }
+
+ /*
+ * Step 7:
+ * Build an internal page that references all of the leaf pages,
+ * and write it, as well as any merged pages, to the file.
+ *
+ * Count how many leaf pages we have (we could track this during the
+ * array shuffling/splitting, but that's a lot harder).
+ */
+ for (leaf_cnt = i = 0; i < ss->pages_next; ++i)
+ if (ss->pages[i] != NULL)
+ ++leaf_cnt;
+ if (leaf_cnt != 0)
+ switch (ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(
+ __slvg_col_build_internal(session, leaf_cnt, ss));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(
+ __slvg_row_build_internal(session, leaf_cnt, ss));
+ break;
+ }
+
+ /*
+ * Step 8:
+ * If we had to merge key ranges, we have to do a final pass through
+ * the leaf page array and discard file pages used during key merges.
+ * We can't do it earlier: if we free'd the leaf pages we're merging as
+ * we merged them, the write of subsequent leaf pages or the internal
+ * page might allocate those free'd file blocks, and if the salvage run
+ * subsequently fails, we'd have overwritten pages used to construct the
+ * final key range. In other words, if the salvage run fails, we don't
+ * want to overwrite data the next salvage run might need.
+ */
+ if (ss->merge_free)
+ WT_ERR(__slvg_merge_block_free(session, ss));
+
+ /*
+ * Step 9:
+ * Evict the newly created root page, creating a checkpoint.
+ */
+ if (ss->root_ref.page != NULL) {
+ btree->ckpt = ckptbase;
+ ret = __wt_rec_evict(session, &ss->root_ref, 1);
+ ss->root_ref.page = NULL;
+ btree->ckpt = NULL;
+ }
+
+ /*
+ * Step 10:
+ * Inform the underlying block manager that we're done.
+ */
+err: WT_TRET(bm->salvage_end(bm, session));
+
+ /* Discard any root page we created. */
+ if (ss->root_ref.page != NULL)
+ __wt_ref_out(session, &ss->root_ref);
+
+ /* Discard the leaf and overflow page memory. */
+ WT_TRET(__slvg_cleanup(session, ss));
+
+ /* Discard temporary buffers. */
+ __wt_scr_free(&ss->tmp1);
+ __wt_scr_free(&ss->tmp2);
+
+ /* Wrap up reporting. */
+ WT_TRET(__wt_progress(session, NULL, ss->fcnt));
+
+ return (ret);
+}
+
+/*
+ * __slvg_read --
+ * Read the file and build a table of the pages we can use.
+ */
+static int
+__slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_BM *bm;
+ WT_DECL_ITEM(as);
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ size_t addr_size;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int eof, valid;
+
+ bm = S2BT(session)->bm;
+ WT_ERR(__wt_scr_alloc(session, 0, &as));
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ for (;;) {
+ /* Get the next block address from the block manager. */
+ WT_ERR(bm->salvage_next(bm, session, addr, &addr_size, &eof));
+ if (eof)
+ break;
+
+ /* Report progress every 10 chunks. */
+ if (++ss->fcnt % 10 == 0)
+ WT_ERR(__wt_progress(session, NULL, ss->fcnt));
+
+ /*
+ * Read (and potentially decompress) the block; the underlying
+ * block manager might return only good blocks if checksums are
+ * configured, or both good and bad blocks if we're relying on
+ * compression.
+ *
+ * Report the block's status to the block manager.
+ */
+ if ((ret = __wt_bt_read(session, buf, addr, addr_size)) == 0)
+ valid = 1;
+ else {
+ valid = 0;
+ if (ret == WT_ERROR)
+ ret = 0;
+ WT_ERR(ret);
+ }
+ WT_ERR(bm->salvage_valid(bm, session, addr, addr_size, valid));
+ if (!valid)
+ continue;
+
+ /* Create a printable version of the address. */
+ WT_ERR(bm->addr_string(bm, session, as, addr, addr_size));
+
+ /*
+ * Make sure it's an expected page type for the file.
+ *
+ * We only care about leaf and overflow pages from here on out;
+ * discard all of the others. We put them on the free list now,
+ * because we might as well overwrite them: we want the file to
+ * grow as little as possible (or shrink), and future salvage
+ * calls don't need them either.
+ */
+ dsk = buf->data;
+ switch (dsk->type) {
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s page ignored %s",
+ __wt_page_type_string(dsk->type),
+ (const char *)as->data));
+ WT_ERR(bm->free(bm, session, addr, addr_size));
+ continue;
+ }
+
+ /*
+ * Verify the page. It's unlikely a page could have a valid
+ * checksum and still be broken, but paranoia is healthy in
+ * salvage. Regardless, verify does return failure because it
+ * detects failures we'd expect to see in a corrupted file, like
+ * overflow references past the end of the file or overflow
+ * references to non-existent pages; we might as well discard
+ * these pages now.
+ */
+ if (__wt_verify_dsk(session, as->data, buf) != 0) {
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s page failed verify %s",
+ __wt_page_type_string(dsk->type),
+ (const char *)as->data));
+ WT_ERR(bm->free(bm, session, addr, addr_size));
+ continue;
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "tracking %s page, generation %" PRIu64 " %s",
+ __wt_page_type_string(dsk->type), dsk->write_gen,
+ (const char *)as->data));
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ if (ss->page_type == WT_PAGE_INVALID)
+ ss->page_type = dsk->type;
+ if (ss->page_type != dsk->type)
+ WT_ERR_MSG(session, WT_ERROR,
+ "file contains multiple file formats (both "
+ "%s and %s), and cannot be salvaged",
+ __wt_page_type_string(ss->page_type),
+ __wt_page_type_string(dsk->type));
+
+ WT_ERR(__slvg_trk_leaf(
+ session, dsk, addr, addr_size, ss));
+ break;
+ case WT_PAGE_OVFL:
+ WT_ERR(__slvg_trk_ovfl(
+ session, dsk, addr, addr_size, ss));
+ break;
+ }
+ }
+
+err: __wt_scr_free(&as);
+ __wt_scr_free(&buf);
+
+ return (ret);
+}
+
+/*
+ * __slvg_trk_init --
+ * Initialize tracking information for a page.
+ */
+static int
+__slvg_trk_init(WT_SESSION_IMPL *session,
+ uint8_t *addr, size_t addr_size,
+ uint32_t size, uint64_t gen, WT_STUFF *ss, WT_TRACK **retp)
+{
+ WT_DECL_RET;
+ WT_TRACK *trk;
+
+ WT_RET(__wt_calloc_def(session, 1, &trk));
+ WT_ERR(__wt_calloc_def(session, 1, &trk->shared));
+ trk->shared->ref = 1;
+
+ trk->ss = ss;
+ WT_ERR(__wt_strndup(session, addr, addr_size, &trk->trk_addr));
+ trk->trk_addr_size = (uint8_t)addr_size;
+ trk->trk_size = size;
+ trk->trk_gen = gen;
+
+ *retp = trk;
+ return (0);
+
+err: __wt_free(session, trk->trk_addr);
+ __wt_free(session, trk->shared);
+ __wt_free(session, trk);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_split --
+ * Split a tracked chunk.
+ */
+static int
+__slvg_trk_split(WT_SESSION_IMPL *session, WT_TRACK *orig, WT_TRACK **newp)
+{
+ WT_TRACK *trk;
+
+ WT_RET(__wt_calloc_def(session, 1, &trk));
+
+ trk->shared = orig->shared;
+ trk->ss = orig->ss;
+
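+ /* Both the original and the new chunk now reference the shared info. */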
+ ++orig->shared->ref;
+
+ *newp = trk;
+ return (0);
+}
+
+/*
+ * __slvg_trk_leaf --
+ * Track a leaf page.
+ */
+static int
+__slvg_trk_leaf(WT_SESSION_IMPL *session,
+ const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_TRACK *trk;
+ uint64_t stop_recno;
+ uint32_t i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ page = NULL;
+ trk = NULL;
+
+ /* Re-allocate the array of pages, as necessary. */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+
+ /* Allocate a WT_TRACK entry for this new page and fill it in. */
+ WT_RET(__slvg_trk_init(
+ session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk));
+
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Column-store fixed-size format: start and stop keys can be
+ * taken from the block's header, and the page doesn't contain
+ * overflow items.
+ */
+ trk->col_start = dsk->recno;
+ trk->col_stop = dsk->recno + (dsk->u.entries - 1);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s records %" PRIu64 "-%" PRIu64,
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ trk->col_start, trk->col_stop));
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store variable-length format: the start key can be
+ * taken from the block's header; the stop key requires walking
+ * the page.
+ */
+ stop_recno = dsk->recno;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ stop_recno += __wt_cell_rle(unpack);
+ }
+
+ trk->col_start = dsk->recno;
+ trk->col_stop = stop_recno - 1;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s records %" PRIu64 "-%" PRIu64,
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ trk->col_start, trk->col_stop));
+
+ /* Column-store pages can contain overflow items. */
+ WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Row-store format: copy the first and last keys on the page.
+ * Keys are prefix-compressed; the simplest and slowest thing
+ * to do is instantiate the in-memory page, then instantiate
+ * and copy the full keys, then free the page. We do this
+ * on every leaf page, and if you need to speed up the salvage,
+ * it's probably a great place to start.
+ */
+ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page));
+ WT_ERR(__wt_row_leaf_key_copy(session,
+ page, &page->pg_row_d[0], &trk->row_start));
+ WT_ERR(__wt_row_leaf_key_copy(session, page,
+ &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop));
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
+ trk->row_start.data, trk->row_start.size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s start key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp2),
+ (int)ss->tmp1->size, (char *)ss->tmp1->data));
+ WT_ERR(__wt_buf_set_printable(session, ss->tmp1,
+ trk->row_stop.data, trk->row_stop.size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s stop key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp2),
+ (int)ss->tmp1->size, (char *)ss->tmp1->data));
+ }
+
+ /* Row-store pages can contain overflow items. */
+ WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
+ break;
+ }
+ ss->pages[ss->pages_next++] = trk;
+
+ if (0) {
+err: __wt_free(session, trk);
+ }
+ if (page != NULL)
+ __wt_page_out(session, &page);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_ovfl --
+ * Track an overflow page.
+ */
+static int
+__slvg_trk_ovfl(WT_SESSION_IMPL *session,
+ const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+
+ /*
+ * Reallocate the overflow page array as necessary, then save the
+ * page's location information.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->ovfl_allocated, ss->ovfl_next + 1, &ss->ovfl));
+
+ WT_RET(__slvg_trk_init(
+ session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk));
+ ss->ovfl[ss->ovfl_next++] = trk;
+
+ return (0);
+}
+
+/*
+ * __slvg_trk_leaf_ovfl --
+ * Search a leaf page for overflow items.
+ */
+static int
+__slvg_trk_leaf_ovfl(
+ WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t i, ovfl_cnt;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+
+ /*
+ * Two passes: count the overflow items, then copy them into an
+ * allocated array.
+ */
+ ovfl_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->ovfl)
+ ++ovfl_cnt;
+ }
+ if (ovfl_cnt == 0)
+ return (0);
+
+ /* Allocate room for the array of overflow addresses and fill it in. */
+ WT_RET(__wt_calloc_def(session, ovfl_cnt, &trk->trk_ovfl_addr));
+ trk->trk_ovfl_cnt = ovfl_cnt;
+
+ ovfl_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->ovfl) {
+ WT_RET(__wt_strndup(session, unpack->data,
+ unpack->size, &trk->trk_ovfl_addr[ovfl_cnt].addr));
+ trk->trk_ovfl_addr[ovfl_cnt].size =
+ (uint8_t)unpack->size;
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s overflow reference %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ __wt_addr_string(session,
+ unpack->data, unpack->size, trk->ss->tmp2)));
+
+ if (++ovfl_cnt == trk->trk_ovfl_cnt)
+ break;
+ }
+ }
+
+ return (0);
+}
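The two-pass shape above -- count the overflow cells, allocate an exactly-sized array, then walk the cells again to fill it -- avoids growing the array incrementally. A minimal standalone sketch of the same pattern on plain integers follows; every name in it is invented for illustration, none of it is WiredTiger code.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Collect the even values from a list: count first, then fill. */
static int
collect_even(const int *src, size_t n, int **outp, size_t *cntp)
{
	size_t cnt, i;
	int *out;

	*outp = NULL;
	*cntp = 0;

	/* Pass 1: count the matching items. */
	for (i = cnt = 0; i < n; ++i)
		if (src[i] % 2 == 0)
			++cnt;
	if (cnt == 0)
		return (0);

	/* Allocate exactly the room required. */
	if ((out = malloc(cnt * sizeof(int))) == NULL)
		return (ENOMEM);

	/* Pass 2: copy the matching items, stopping at the count. */
	for (i = cnt = 0; i < n; ++i)
		if (src[i] % 2 == 0)
			out[cnt++] = src[i];

	*outp = out;
	*cntp = cnt;
	return (0);
}

int
main(void)
{
	int src[] = { 1, 2, 3, 4, 6 };
	int *out;
	size_t cnt, i;

	if (collect_even(src, 5, &out, &cnt) == 0) {
		for (i = 0; i < cnt; ++i)
			printf("%d\n", out[i]);
		free(out);
	}
	return (0);
}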
+
+/*
+ * __slvg_col_range --
+ * Figure out the leaf pages we need and free the leaf pages we don't.
+ *
+ * When pages split, the key range is split across multiple pages. If not all
+ * of the old versions of the page are overwritten, or not all of the new pages
+ * are written, or some of the pages are corrupted, salvage will read different
+ * pages with overlapping key ranges, at different LSNs.
+ *
+ * We salvage all of the key ranges we find, at the latest LSN value: this means
+ * we may resurrect pages of deleted items, as page deletion doesn't write leaf
+ * pages and salvage will read and instantiate the contents of an old version of
+ * the deleted page.
+ *
+ * The leaf page array is sorted in key order, and secondarily on LSN: what this
+ * means is that for each new key range, the first page we find is the best page
+ * for that key. The process is to walk forward from each page until we reach
+ * a page with a starting key after the current page's stopping key.
+ *
+ * For each page, check to see if it overlaps the current page's key range.
+ * If it does, resolve the overlap. Because WiredTiger rarely splits pages,
+ * overlap resolution usually means discarding a page because the key ranges
+ * are the same, and one of the pages is simply an old version of the other.
+ *
+ * However, it's possible more complex resolution is necessary. For example,
+ * here's an improbably complex list of page ranges and LSNs:
+ *
+ * Page Range LSN
+ * 30 A-G 3
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ * 35 F-M 8
+ * 36 H-O 9
+ *
+ * We walk forward from each page reviewing all other pages in the array that
+ * overlap the range. For each overlap, the current or the overlapping
+ * page is updated so the page with the most recent information for any range
+ * "owns" that range. Here's an example for page 30.
+ *
+ * Review page 31: because page 31 has the range C-D and a higher LSN than page
+ * 30, page 30 would "split" into two ranges, A-C and E-G, conceding the C-D
+ * range to page 31. The new track element would be inserted into the array
+ * with the following result:
+ *
+ * Page Range LSN
+ * 30 A-C 3 << Changed WT_TRACK element
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ * 30 E-G 3 << New WT_TRACK element
+ * 35 F-M 8
+ * 36 H-O 9
+ *
+ * Continue the review of the first element, using its new values.
+ *
+ * Review page 32: because page 32 has the range B-C and a higher LSN than page
+ * 30, page 30's A-C range would be truncated, conceding the B-C range to page
+ * 32.
+ * 30 A-B 3
+ * E-G 3
+ * 31 C-D 4
+ * 32 B-C 5
+ * 33 C-F 6
+ * 34 C-D 7
+ *
+ * Review page 33: because page 33 has a starting key (C) past page 30's ending
+ * key (B), we stop evaluating page 30's A-B range, as there can be no further
+ * overlaps.
+ *
+ * This process is repeated for each page in the array.
+ *
+ * When page 33 is processed, we'd discover that page 33's C-F range overlaps
+ * page 30's E-G range, and page 30's E-G range would be updated, conceding the
+ * E-F range to page 33.
+ *
+ * This is not computationally expensive: because the leaf array is sorted by
+ * starting key we never walk far forward, and because WiredTiger splits are
+ * rare, the chance of finding the kind of range overlap that requires
+ * re-sorting the array is small.
+ */
+static int
+__slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *jtrk;
+ uint32_t i, j;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ *
+ * Walk the page array looking for overlapping key ranges, adjusting
+ * the ranges based on the LSN until there are no overlaps.
+ *
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
+ * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
+ * PLUS OFFSET.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+
+ /* Check for pages that overlap our page. */
+ for (j = i + 1; j < ss->pages_next; ++j) {
+ if (ss->pages[j] == NULL)
+ continue;
+ /*
+ * We're done if this page starts after our stop, no
+ * subsequent pages can overlap our page.
+ */
+ if (ss->pages[j]->col_start >
+ ss->pages[i]->col_stop)
+ break;
+
+ /* There's an overlap, fix it up. */
+ jtrk = ss->pages[j];
+ WT_RET(__slvg_col_range_overlap(session, i, j, ss));
+
+ /*
+ * If the overlap resolution changed the entry's start
+ * key, the entry might have moved and the page array
+ * re-sorted, and pages[j] would reference a different
+ * page. We don't move forward if that happened, we
+ * re-process the slot again (by decrementing j before
+ * the loop's increment).
+ */
+ if (ss->pages[j] != NULL && jtrk != ss->pages[j])
+ --j;
+ }
+ }
+ return (0);
+}
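The nested scan above looks quadratic but is cheap in practice: the array is sorted by start key, so the inner loop breaks as soon as an entry starts past the current entry's stop. Here's a hedged sketch of the same early-terminating scan over integer ranges -- detection only, with invented types, not WiredTiger code; the ranges mirror the table in the comment above.

#include <stdio.h>

struct range { unsigned long start, stop; };

/* The ranges must be sorted by start; report each overlapping pair. */
static void
report_overlaps(const struct range *r, size_t n)
{
	size_t i, j;

	for (i = 0; i < n; ++i)
		for (j = i + 1; j < n; ++j) {
			if (r[j].start > r[i].stop)
				break;	/* no later entry can overlap */
			printf("ranges %zu and %zu overlap\n", i, j);
		}
}

int
main(void)
{
	struct range r[] = {	/* A-G, B-C, C-D, C-D, C-F, F-M, H-O */
		{ 'A', 'G' }, { 'B', 'C' }, { 'C', 'D' },
		{ 'C', 'D' }, { 'C', 'F' }, { 'F', 'M' }, { 'H', 'O' }
	};

	report_overlaps(r, sizeof(r) / sizeof(r[0]));
	return (0);
}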
+
+/*
+ * __slvg_col_range_overlap --
+ * Two column-store key ranges overlap, deal with it.
+ */
+static int
+__slvg_col_range_overlap(
+ WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
+{
+ WT_TRACK *a_trk, *b_trk, *new;
+ uint32_t i;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ */
+ a_trk = ss->pages[a_slot];
+ b_trk = ss->pages[b_slot];
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s range overlap",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+
+ /*
+ * The key ranges of two WT_TRACK pages in the array overlap -- choose
+ * the ranges we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB pages are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ * Note the leaf page array was sorted by key and a_trk appears earlier
+ * in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
+ *
+ * Finally, there's one additional complicating factor -- final ranges
+ * are assigned based on the page's LSN.
+ */
+ /* Case #2/8, #10, #11 */
+ if (a_trk->col_start > b_trk->col_start)
+ WT_PANIC_RET(
+ session, EINVAL, "unexpected merge array sort order");
+
+ if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */
+ /*
+ * The secondary sort of the leaf page array was the page's LSN,
+ * in high-to-low order, which means a_trk has a higher LSN, and
+		 * is more desirable, than b_trk. In cases #1, #4 and #9, where
+		 * the start of the range is the same for the two pages, this
+		 * simplifies things: it guarantees a_trk has a higher LSN than
+		 * b_trk.
+ */
+ if (a_trk->col_stop >= b_trk->col_stop)
+ /*
+ * Case #1, #4: a_trk is a superset of b_trk, and a_trk
+ * is more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #9: b_trk is a superset of a_trk, but a_trk is more
+ * desirable: keep both but delete a_trk's key range from
+ * b_trk.
+ */
+ b_trk->col_start = a_trk->col_stop + 1;
+ __slvg_col_trk_update_start(b_slot, ss);
+ F_SET(b_trk, WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (a_trk->col_stop == b_trk->col_stop) { /* Case #6 */
+ if (a_trk->trk_gen > b_trk->trk_gen)
+ /*
+ * Case #6: a_trk is a superset of b_trk and a_trk is
+ * more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #6: a_trk is a superset of b_trk, but b_trk is more
+ * desirable: keep both but delete b_trk's key range from a_trk.
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+ F_SET(a_trk, WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (a_trk->col_stop < b_trk->col_stop) { /* Case #3/7 */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+ /*
+ * Case #3/7: a_trk is more desirable, delete a_trk's
+			 * key range from b_trk.
+ */
+ b_trk->col_start = a_trk->col_stop + 1;
+ __slvg_col_trk_update_start(b_slot, ss);
+ F_SET(b_trk, WT_TRACK_MERGE);
+ } else {
+ /*
+ * Case #3/7: b_trk is more desirable, delete b_trk's
+			 * key range from a_trk.
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+ F_SET(a_trk, WT_TRACK_MERGE);
+ }
+ goto merge;
+ }
+
+ /*
+ * Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
+ * discard b_trk.
+ */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+delete_b: /*
+ * After page and overflow reconciliation, one (and only one)
+ * page can reference an overflow record. But, if we split a
+ * page into multiple chunks, any of the chunks might own any
+ * of the backing overflow records, so overflow records won't
+ * normally be discarded until after the merge phase completes.
+ * (The merge phase is where the final pages are written, and
+ * we figure out which overflow records are actually used.)
+ * If freeing a chunk and there are no other references to the
+ * underlying shared information, the overflow records must be
+ * useless, discard them to keep the final file size small.
+ */
+ if (b_trk->shared->ref == 1)
+ for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_trk_free(session,
+ &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1));
+ return (__slvg_trk_free(session, &ss->pages[b_slot], 1));
+ }
+
+ /*
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
+ * Split a_trk into two parts, the key range before b_trk and the
+ * key range after b_trk.
+ */
+ WT_RET(__slvg_trk_split(session, a_trk, &new));
+
+ /*
+ * Second, reallocate the array of pages if necessary, and then insert
+ * the new element into the array after the existing element (that's
+ * probably wrong, but we'll fix it up in a second).
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+ memmove(ss->pages + a_slot + 1, ss->pages + a_slot,
+ (ss->pages_next - a_slot) * sizeof(*ss->pages));
+ ss->pages[a_slot + 1] = new;
+ ++ss->pages_next;
+
+ /*
+ * Third, set its start key to be the first key after the stop key of
+ * the middle chunk (that's b_trk), and its stop key to be the stop key
+ * of the original chunk, and call __slvg_col_trk_update_start. That
+ * function will re-sort the WT_TRACK array as necessary to move our
+ * new entry into the right sorted location.
+ */
+ new->col_start = b_trk->col_stop + 1;
+ new->col_stop = a_trk->col_stop;
+ __slvg_col_trk_update_start(a_slot + 1, ss);
+
+ /*
+ * Fourth, set the original WT_TRACK information to reference only
+ * the initial key space in the page, that is, everything up to the
+ * starting key of the middle chunk (that's b_trk).
+ */
+ a_trk->col_stop = b_trk->col_start - 1;
+
+ F_SET(new, WT_TRACK_MERGE);
+ F_SET(a_trk, WT_TRACK_MERGE);
+
+merge: WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s require merge",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+ return (0);
+}
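Because the sort guarantees a_trk never starts after b_trk, the eleven diagrammed cases collapse into the four comparisons above. A small sketch of that dispatch on integer record ranges -- the case labels follow the comment, everything else is invented for illustration:

#include <assert.h>
#include <stdio.h>

/*
 * Classify an overlap between ranges A and B, where the caller
 * guarantees A starts no later than B and the two ranges overlap.
 */
static const char *
overlap_case(unsigned long a_start, unsigned long a_stop,
    unsigned long b_start, unsigned long b_stop)
{
	assert(a_start <= b_start && b_start <= a_stop);

	if (a_start == b_start)			/* Cases #1, #4, #9 */
		return (a_stop >= b_stop ?
		    "#1/#4: A is a superset" : "#9: B is a superset");
	if (a_stop == b_stop)			/* Case #6 */
		return ("#6: B is a suffix of A");
	if (a_stop < b_stop)			/* Cases #3/#7 */
		return ("#3/#7: B overlaps A's end");
	return ("#5: B is in the middle of A");	/* Case #5 */
}

int
main(void)
{
	printf("%s\n", overlap_case(30, 50, 35, 40));	/* prints case #5 */
	return (0);
}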
+
+/*
+ * __slvg_col_trk_update_start --
+ * Update a column-store page's start key after an overlap.
+ */
+static void
+__slvg_col_trk_update_start(uint32_t slot, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint32_t i;
+
+ trk = ss->pages[slot];
+
+ /*
+ * If we deleted an initial piece of the WT_TRACK name space, it may no
+ * longer be in the right location.
+ *
+	 * For example, imagine page #1 has the key range 30-50, it split, and
+	 * we wrote page #2 with key range 30-40 and page #3 with key range
+	 * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the
+	 * key ranges were sorted, pages #2 and #1 came first (because their
+	 * start keys are earlier than page #3's), and page #2 came before page
+	 * #1 because of its higher LSN. When we resolve the overlap between
+	 * page #2 and page #1, we truncate the initial key range of page #1,
+	 * and it now sorts after page #3, because it has the same starting key
+	 * of 40 and a lower LSN.
+ *
+ * We have already updated b_trk's start key; what we may have to do is
+ * re-sort some number of elements in the list.
+ */
+ for (i = slot + 1; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+ if (ss->pages[i]->col_start > trk->col_stop)
+ break;
+ }
+ i -= slot;
+ if (i > 1)
+ qsort(ss->pages + slot, (size_t)i,
+ sizeof(WT_TRACK *), __slvg_trk_compare_key);
+}
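Only a bounded window of the array can be out of order after a start key grows: the slots from the adjusted entry up to the first entry starting past its stop. The function therefore re-sorts just that window. A hedged standalone sketch of the same window re-sort, with invented types:

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long start, stop; };

static int
cmp_range_start(const void *a, const void *b)
{
	const struct range *x = a, *y = b;

	return (x->start < y->start ? -1 : (x->start > y->start ? 1 : 0));
}

int
main(void)
{
	/*
	 * Slot 0's start key was just advanced from 5 to 41 after an
	 * overlap was resolved, so it may belong later in the array.
	 */
	struct range r[] = {
		{ 41, 45 }, { 10, 20 }, { 30, 40 }, { 44, 50 }, { 60, 70 }
	};
	size_t i, n = sizeof(r) / sizeof(r[0]);

	/* Walk forward until an entry starts past the adjusted stop. */
	for (i = 1; i < n; ++i)
		if (r[i].start > r[0].stop)
			break;

	/* Re-sort only the window that can be out of order. */
	if (i > 1)
		qsort(r, i, sizeof(r[0]), cmp_range_start);

	for (i = 0; i < n; ++i)
		printf("%lu-%lu\n", r[i].start, r[i].stop);
	return (0);
}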
+
+/*
+ * __slvg_col_range_missing --
+ * Detect missing ranges from column-store files.
+ */
+static int
+__slvg_col_range_missing(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint64_t r;
+ uint32_t i;
+
+ for (i = 0, r = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+ if (trk->col_start != r + 1) {
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s column-store missing range from %"
+ PRIu64 " to %" PRIu64 " inclusive",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ r + 1, trk->col_start - 1));
+
+ /*
+ * We need to instantiate deleted items for the missing
+ * record range.
+ */
+ trk->col_missing = r + 1;
+ F_SET(trk, WT_TRACK_MERGE);
+ }
+ r = trk->col_stop;
+ }
+ return (0);
+}
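Column-store records are numbered contiguously from 1, and by this point the page array is sorted and non-overlapping, so a missing range shows up whenever a start record isn't exactly one past the previous stop. A minimal sketch of the same gap check (illustrative only, not WiredTiger code):

#include <stdio.h>

struct range { unsigned long start, stop; };

/* Ranges are sorted and non-overlapping; records are numbered from 1. */
static void
report_gaps(const struct range *r, size_t n)
{
	unsigned long last_stop;
	size_t i;

	for (i = 0, last_stop = 0; i < n; ++i) {
		if (r[i].start != last_stop + 1)
			printf("missing records %lu-%lu\n",
			    last_stop + 1, r[i].start - 1);
		last_stop = r[i].stop;
	}
}

int
main(void)
{
	struct range r[] = { { 1, 10 }, { 14, 20 }, { 21, 30 } };

	report_gaps(r, sizeof(r) / sizeof(r[0]));	/* missing 11-13 */
	return (0);
}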
+
+/*
+ * __slvg_modify_init --
+ * Initialize a salvage page's modification information.
+ */
+static int
+__slvg_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_RET(__wt_page_modify_init(session, page));
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+/*
+ * __slvg_col_build_internal --
+ * Build a column-store in-memory page that references all of the leaf
+ * pages we've found.
+ */
+static int
+__slvg_col_build_internal(
+ WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref, **refp;
+ WT_TRACK *trk;
+ uint32_t i;
+
+ addr = NULL;
+
+ /* Allocate a column-store root (internal) page and fill it in. */
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_COL_INT, 1, leaf_cnt, 1, &page));
+ WT_ERR(__slvg_modify_init(session, page));
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+
+ ref = *refp++;
+ ref->home = page;
+ ref->page = NULL;
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ WT_ERR(__wt_strndup(
+ session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
+ addr->size = trk->trk_addr_size;
+ addr->type =
+ trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF;
+ ref->addr = addr;
+ addr = NULL;
+
+ ref->key.recno = trk->col_start;
+ ref->state = WT_REF_DISK;
+
+ /*
+ * If the page's key range is unmodified from when we read it
+ * (in other words, we didn't merge part of this page with
+ * another page), we can use the page without change, and the
+ * only thing we need to do is mark all overflow records the
+ * page references as in-use.
+ *
+ * If we did merge with another page, we have to build a page
+ * reflecting the updated key range. Note, that requires an
+ * additional pass to free the merge page's backing blocks.
+ */
+ if (F_ISSET(trk, WT_TRACK_MERGE)) {
+ ss->merge_free = 1;
+
+ WT_ERR(__slvg_col_build_leaf(session, trk, ref));
+ } else
+ WT_ERR(__slvg_ovfl_ref_all(session, trk));
+ ++ref;
+ }
+
+ __wt_root_ref_init(&ss->root_ref, page, 1);
+
+ if (0) {
+err: if (addr != NULL)
+ __wt_free(session, addr);
+ __wt_page_out(session, &page);
+ }
+ return (ret);
+}
+
+/*
+ * __slvg_col_build_leaf --
+ * Build a column-store leaf page for a merged page.
+ */
+static int
+__slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
+{
+ WT_COL *save_col_var;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_SALVAGE_COOKIE *cookie, _cookie;
+ uint64_t skip, take;
+ uint32_t *entriesp, save_entries;
+
+ cookie = &_cookie;
+ WT_CLEAR(*cookie);
+
+ /* Get the original page, including the full in-memory setup. */
+ WT_RET(__wt_page_in(session, ref, 0));
+ page = ref->page;
+
+ entriesp = page->type == WT_PAGE_COL_VAR ?
+ &page->pg_var_entries : &page->pg_fix_entries;
+
+ save_col_var = page->pg_var_d;
+ save_entries = *entriesp;
+
+ /*
+ * Calculate the number of K/V entries we are going to skip, and
+ * the total number of K/V entries we'll take from this page.
+ */
+ cookie->skip = skip = trk->col_start - page->pg_var_recno;
+ cookie->take = take = (trk->col_stop - trk->col_start) + 1;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding first %" PRIu64 " records, "
+ "then taking %" PRIu64 " records",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ skip, take));
+
+ /* Set the referenced flag on overflow pages we're using. */
+ if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0)
+ WT_ERR(__slvg_col_ovfl(session, trk, page, skip, take));
+
+ /*
+ * If we're missing some part of the range, the real start range is in
+ * trk->col_missing, else, it's in trk->col_start. Update the parent's
+ * reference as well as the page itself.
+ */
+ if (trk->col_missing == 0)
+ page->pg_var_recno = trk->col_start;
+ else {
+ page->pg_var_recno = trk->col_missing;
+ cookie->missing = trk->col_start - trk->col_missing;
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge inserting %" PRIu64 " missing records",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
+ cookie->missing));
+ }
+ ref->key.recno = page->pg_var_recno;
+
+ /*
+ * We can't discard the original blocks associated with this page now.
+ * (The problem is we don't want to overwrite any original information
+ * until the salvage run succeeds -- if we free the blocks now, the next
+ * merge page we write might allocate those blocks and overwrite them,
+ * and should the salvage run eventually fail, the original information
+ * would have been lost.) Clear the reference addr so eviction doesn't
+ * free the underlying blocks.
+ */
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ ref->addr = NULL;
+
+ /* Write the new version of the leaf page to disk. */
+ WT_ERR(__slvg_modify_init(session, page));
+ WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+
+ /* Reset the page. */
+ page->pg_var_d = save_col_var;
+ *entriesp = save_entries;
+
+ ret = __wt_page_release(session, ref, 0);
+ if (ret == 0)
+ ret = __wt_rec_evict(session, ref, 1);
+
+ if (0) {
+err: WT_TRET(__wt_page_release(session, ref, 0));
+ }
+
+ return (ret);
+}
+
+/*
+ * __slvg_col_ovfl_single --
+ * Find a single overflow record in the merge page's list, and mark it as
+ * referenced.
+ */
+static int
+__slvg_col_ovfl_single(
+ WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack)
+{
+ WT_TRACK *ovfl;
+ uint32_t i;
+
+ /*
+ * Search the list of overflow records for this page -- we should find
+ * exactly one match, and we mark it as referenced.
+ */
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i) {
+ ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]];
+ if (unpack->size == ovfl->trk_addr_size &&
+ memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0)
+ return (__slvg_ovfl_ref(session, ovfl, 0));
+ }
+
+ WT_PANIC_RET(session,
+ EINVAL, "overflow record at column-store page merge not found");
+}
+
+/*
+ * __slvg_col_ovfl --
+ * Mark overflow items referenced by the merged page.
+ */
+static int
+__slvg_col_ovfl(WT_SESSION_IMPL *session,
+ WT_TRACK *trk, WT_PAGE *page, uint64_t skip, uint64_t take)
+{
+ WT_CELL_UNPACK unpack;
+ WT_CELL *cell;
+ WT_COL *cip;
+ WT_DECL_RET;
+ uint64_t recno, start, stop;
+ uint32_t i;
+
+ /*
+ * Merging a variable-length column-store page, and we took some number
+ * of records, figure out which (if any) overflow records we used.
+ */
+ recno = page->pg_var_recno;
+ start = recno + skip;
+ stop = (recno + skip + take) - 1;
+
+ WT_COL_FOREACH(page, cip, i) {
+ cell = WT_COL_PTR(page, cip);
+ __wt_cell_unpack(cell, &unpack);
+ recno += __wt_cell_rle(&unpack);
+
+		/*
+		 * I keep getting this calculation wrong, so here's the logic.
+		 * Start is the first record we want, stop is the last record
+		 * we want. The record number has already been incremented one
+		 * past the maximum record number for this page entry, that is,
+		 * it's set to the first record number for the next page entry.
+		 * The test against start is greater-than (not greater-than-or-
+		 * equal) because of that increment: if the record number
+		 * equals start, this entry ended just before start, and we
+		 * want the next entry. The test against stop is greater-than
+		 * (not greater-than-or-equal) because stop is the last record
+		 * wanted: if the record number equals stop, the next entry
+		 * still begins at a wanted record.
+		 */
+ if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) {
+ ret = __slvg_col_ovfl_single(session, trk, &unpack);
+
+ /*
+ * When handling overlapping ranges on variable-length
+ * column-store leaf pages, we split ranges without
+ * considering if we were splitting RLE units. (See
+ * note at the beginning of this file for explanation
+ * of the overall process.) If the RLE unit was on-page,
+ * we can simply write it again. If the RLE unit was an
+ * overflow value that's already been used by another
+ * row (from some other page created by a range split),
+ * there's not much to do, this row can't reference an
+ * overflow record we don't have: delete the row.
+ */
+ if (ret == EBUSY) {
+ __wt_cell_type_reset(session,
+ cell, WT_CELL_VALUE_OVFL, WT_CELL_DEL);
+ ret = 0;
+ }
+ WT_RET(ret);
+ }
+ if (recno > stop)
+ break;
+ }
+ return (0);
+}
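The comment in the loop above deserves a concrete check: with RLE entries covering records 1-10 and 11-20, a merge taking records 10 through 11 must visit both entries, and the strict greater-than tests do exactly that. A hedged sketch of the same increment-then-test logic on plain integers (none of this is WiredTiger code):

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	/* Two RLE entries: records 1-10 and 11-20. */
	unsigned long rle[] = { 10, 10 };
	unsigned long recno, start, stop;
	int visited[] = { 0, 0 };
	size_t i;

	start = 10;			/* first record wanted */
	stop = 11;			/* last record wanted */

	for (recno = 1, i = 0; i < 2; ++i) {
		recno += rle[i];	/* first recno of the next entry */

		/* recno > start: this entry's last record is wanted. */
		if (recno > start)
			visited[i] = 1;

		/* recno > stop: the next entry starts past the range. */
		if (recno > stop)
			break;
	}

	assert(visited[0] && visited[1]);
	return (0);
}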
+
+/*
+ * __slvg_row_range --
+ * Figure out the leaf pages we need and discard everything else. At the
+ * same time, tag the overflow pages they reference.
+ */
+static int
+__slvg_row_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *jtrk;
+ WT_BTREE *btree;
+ uint32_t i, j;
+ int cmp;
+
+ btree = S2BT(session);
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ *
+ * Walk the page array looking for overlapping key ranges, adjusting
+ * the ranges based on the LSN until there are no overlaps.
+ *
+ * DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
+ * AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
+ * PLUS OFFSET.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+
+ /* Check for pages that overlap our page. */
+ for (j = i + 1; j < ss->pages_next; ++j) {
+ if (ss->pages[j] == NULL)
+ continue;
+ /*
+ * We're done if this page starts after our stop, no
+ * subsequent pages can overlap our page.
+ */
+ WT_RET(__wt_compare(session, btree->collator,
+ &ss->pages[j]->row_start, &ss->pages[i]->row_stop,
+ &cmp));
+ if (cmp > 0)
+ break;
+
+ /* There's an overlap, fix it up. */
+ jtrk = ss->pages[j];
+ WT_RET(__slvg_row_range_overlap(session, i, j, ss));
+
+ /*
+ * If the overlap resolution changed the entry's start
+ * key, the entry might have moved and the page array
+ * re-sorted, and pages[j] would reference a different
+ * page. We don't move forward if that happened, we
+ * re-process the slot again (by decrementing j before
+ * the loop's increment).
+ */
+ if (ss->pages[j] != NULL && jtrk != ss->pages[j])
+ --j;
+ }
+ }
+ return (0);
+}
+
+/*
+ * __slvg_row_range_overlap --
+ * Two row-store key ranges overlap, deal with it.
+ */
+static int
+__slvg_row_range_overlap(
+ WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_TRACK *a_trk, *b_trk, *new;
+ uint32_t i;
+ int start_cmp, stop_cmp;
+
+ /*
+ * DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
+ * COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
+ * BEING HANDLED.
+ */
+ btree = S2BT(session);
+
+ a_trk = ss->pages[a_slot];
+ b_trk = ss->pages[b_slot];
+
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s range overlap",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+
+ /*
+ * The key ranges of two WT_TRACK pages in the array overlap -- choose
+ * the ranges we're going to take from each.
+ *
+ * We can think of the overlap possibilities as 11 different cases:
+ *
+ * AAAAAAAAAAAAAAAAAA
+ * #1 BBBBBBBBBBBBBBBBBB pages are the same
+ * #2 BBBBBBBBBBBBB overlaps the beginning
+ * #3 BBBBBBBBBBBBBBBB overlaps the end
+ * #4 BBBBB B is a prefix of A
+ * #5 BBBBBB B is middle of A
+ * #6 BBBBBBBBBB B is a suffix of A
+ *
+ * and:
+ *
+ * BBBBBBBBBBBBBBBBBB
+ * #7 AAAAAAAAAAAAA same as #3
+ * #8 AAAAAAAAAAAAAAAA same as #2
+ * #9 AAAAA A is a prefix of B
+ * #10 AAAAAA A is middle of B
+ * #11 AAAAAAAAAA A is a suffix of B
+ *
+ * Note the leaf page array was sorted by key and a_trk appears earlier
+ * in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
+ *
+ * Finally, there's one additional complicating factor -- final ranges
+ * are assigned based on the page's LSN.
+ */
+#define A_TRK_START (&a_trk->row_start)
+#define A_TRK_STOP (&a_trk->row_stop)
+#define B_TRK_START (&b_trk->row_start)
+#define B_TRK_STOP (&b_trk->row_stop)
+#define SLOT_START(i) (&ss->pages[i]->row_start)
+#define __slvg_key_copy(session, dst, src) \
+ __wt_buf_set(session, dst, (src)->data, (src)->size)
+
+ WT_RET(__wt_compare(
+ session, btree->collator, A_TRK_START, B_TRK_START, &start_cmp));
+ WT_RET(__wt_compare(
+ session, btree->collator, A_TRK_STOP, B_TRK_STOP, &stop_cmp));
+
+ if (start_cmp > 0) /* Case #2/8, #10, #11 */
+ WT_PANIC_RET(
+ session, EINVAL, "unexpected merge array sort order");
+
+ if (start_cmp == 0) { /* Case #1, #4, #9 */
+ /*
+ * The secondary sort of the leaf page array was the page's LSN,
+ * in high-to-low order, which means a_trk has a higher LSN, and
+		 * is more desirable, than b_trk. In cases #1, #4 and #9, where
+		 * the start of the range is the same for the two pages, this
+		 * simplifies things: it guarantees a_trk has a higher LSN than
+		 * b_trk.
+ */
+ if (stop_cmp >= 0)
+ /*
+ * Case #1, #4: a_trk is a superset of b_trk, and a_trk
+ * is more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #9: b_trk is a superset of a_trk, but a_trk is more
+ * desirable: keep both but delete a_trk's key range from
+ * b_trk.
+ */
+ WT_RET(__slvg_row_trk_update_start(
+ session, A_TRK_STOP, b_slot, ss));
+ F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (stop_cmp == 0) { /* Case #6 */
+ if (a_trk->trk_gen > b_trk->trk_gen)
+ /*
+ * Case #6: a_trk is a superset of b_trk and a_trk is
+ * more desirable -- discard b_trk.
+ */
+ goto delete_b;
+
+ /*
+ * Case #6: a_trk is a superset of b_trk, but b_trk is more
+ * desirable: keep both but delete b_trk's key range from a_trk.
+ */
+ WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
+ F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
+ goto merge;
+ }
+
+ if (stop_cmp < 0) { /* Case #3/7 */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+ /*
+ * Case #3/7: a_trk is more desirable, delete a_trk's
+			 * key range from b_trk.
+ */
+ WT_RET(__slvg_row_trk_update_start(
+ session, A_TRK_STOP, b_slot, ss));
+ F_SET(b_trk, WT_TRACK_CHECK_START | WT_TRACK_MERGE);
+ } else {
+ /*
+ * Case #3/7: b_trk is more desirable, delete b_trk's
+			 * key range from a_trk.
+ */
+ WT_RET(__slvg_key_copy(
+ session, A_TRK_STOP, B_TRK_START));
+ F_SET(a_trk, WT_TRACK_CHECK_STOP | WT_TRACK_MERGE);
+ }
+ goto merge;
+ }
+
+ /*
+ * Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
+ * discard b_trk.
+ */
+ if (a_trk->trk_gen > b_trk->trk_gen) {
+delete_b: /*
+ * After page and overflow reconciliation, one (and only one)
+ * page can reference an overflow record. But, if we split a
+ * page into multiple chunks, any of the chunks might own any
+ * of the backing overflow records, so overflow records won't
+ * normally be discarded until after the merge phase completes.
+ * (The merge phase is where the final pages are written, and
+ * we figure out which overflow records are actually used.)
+ * If freeing a chunk and there are no other references to the
+ * underlying shared information, the overflow records must be
+ * useless, discard them to keep the final file size small.
+ */
+ if (b_trk->shared->ref == 1)
+ for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_trk_free(session,
+ &ss->ovfl[b_trk->trk_ovfl_slot[i]], 1));
+ return (__slvg_trk_free(session, &ss->pages[b_slot], 1));
+ }
+
+ /*
+ * Case #5: b_trk is more desirable and is a middle chunk of a_trk.
+ * Split a_trk into two parts, the key range before b_trk and the
+ * key range after b_trk.
+ */
+ WT_RET(__slvg_trk_split(session, a_trk, &new));
+
+ /*
+ * Second, reallocate the array of pages if necessary, and then insert
+ * the new element into the array after the existing element (that's
+ * probably wrong, but we'll fix it up in a second).
+ */
+ WT_RET(__wt_realloc_def(
+ session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
+ memmove(ss->pages + a_slot + 1, ss->pages + a_slot,
+ (ss->pages_next - a_slot) * sizeof(*ss->pages));
+ ss->pages[a_slot + 1] = new;
+ ++ss->pages_next;
+
+ /*
+	 * Third, set its stop key to be the stop key of the original chunk,
+ * and call __slvg_row_trk_update_start. That function will both set
+ * the start key to be the first key after the stop key of the middle
+ * chunk (that's b_trk), and re-sort the WT_TRACK array as necessary to
+ * move our new entry into the right sorted location.
+ */
+ WT_RET(__slvg_key_copy(session, &new->row_stop, A_TRK_STOP));
+ WT_RET(
+ __slvg_row_trk_update_start(session, B_TRK_STOP, a_slot + 1, ss));
+
+ /*
+ * Fourth, set the original WT_TRACK information to reference only
+ * the initial key space in the page, that is, everything up to the
+ * starting key of the middle chunk (that's b_trk).
+ */
+ WT_RET(__slvg_key_copy(session, A_TRK_STOP, B_TRK_START));
+ F_SET(new, WT_TRACK_CHECK_START);
+ F_SET(a_trk, WT_TRACK_CHECK_STOP);
+
+ F_SET(new, WT_TRACK_MERGE);
+ F_SET(a_trk, WT_TRACK_MERGE);
+
+merge: WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s and %s require merge",
+ __wt_addr_string(
+ session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(
+ session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2)));
+ return (0);
+}
+
+/*
+ * __slvg_row_trk_update_start --
+ * Update a row-store page's start key after an overlap.
+ */
+static int
+__slvg_row_trk_update_start(
+ WT_SESSION_IMPL *session, WT_ITEM *stop, uint32_t slot, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(dsk);
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_TRACK *trk;
+ uint32_t i;
+ int cmp, found;
+
+ btree = S2BT(session);
+ page = NULL;
+ found = 0;
+
+ trk = ss->pages[slot];
+
+ /*
+ * If we deleted an initial piece of the WT_TRACK name space, it may no
+ * longer be in the right location.
+ *
+	 * For example, imagine page #1 has the key range 30-50, it split, and
+	 * we wrote page #2 with key range 30-40 and page #3 with key range
+	 * 40-50, where pages #2 and #3 have larger LSNs than page #1. When the
+	 * key ranges were sorted, pages #2 and #1 came first (because their
+	 * start keys are earlier than page #3's), and page #2 came before page
+	 * #1 because of its higher LSN. When we resolve the overlap between
+	 * page #2 and page #1, we truncate the initial key range of page #1,
+	 * and it now sorts after page #3, because it has the same starting key
+	 * of 40 and a lower LSN.
+ *
+ * First, update the WT_TRACK start key based on the specified stop key.
+ *
+	 * Read and instantiate the WT_TRACK page (we don't have to verify the
+	 * page, nor do we have to be quiet on error; we've already read this
+	 * page successfully).
+ */
+ WT_RET(__wt_scr_alloc(session, trk->trk_size, &dsk));
+ WT_ERR(__wt_bt_read(session, dsk, trk->trk_addr, trk->trk_addr_size));
+ WT_ERR(__wt_page_inmem(session, NULL, dsk->mem, 0, &page));
+
+ /*
+ * Walk the page, looking for a key sorting greater than the specified
+ * stop key -- that's our new start key.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ WT_ERR(__wt_compare(session, btree->collator, key, stop, &cmp));
+ if (cmp > 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ /*
+	 * We know that at least one key on the page sorts after the specified
+	 * stop key; otherwise the page would have entirely overlapped, we
+	 * would have discarded it, and we wouldn't be here. Therefore, this
+	 * test is safe. (But it never hurts to check.)
+ */
+ WT_ERR_TEST(!found, WT_ERROR);
+ WT_ERR(__slvg_key_copy(session, &trk->row_start, key));
+
+ /*
+ * We may need to re-sort some number of elements in the list. Walk
+ * forward in the list until reaching an entry which cannot overlap
+ * the adjusted entry. If it's more than a single slot, re-sort the
+ * entries.
+ */
+ for (i = slot + 1; i < ss->pages_next; ++i) {
+ if (ss->pages[i] == NULL)
+ continue;
+ WT_ERR(__wt_compare(session,
+ btree->collator, SLOT_START(i), &trk->row_stop, &cmp));
+ if (cmp > 0)
+ break;
+ }
+ i -= slot;
+ if (i > 1)
+ qsort(ss->pages + slot, (size_t)i,
+ sizeof(WT_TRACK *), __slvg_trk_compare_key);
+
+err: if (page != NULL)
+ __wt_page_out(session, &page);
+ __wt_scr_free(&dsk);
+ __wt_scr_free(&key);
+
+ return (ret);
+}
+
+/*
+ * __slvg_row_build_internal --
+ * Build a row-store in-memory page that references all of the leaf
+ * pages we've found.
+ */
+static int
+__slvg_row_build_internal(
+ WT_SESSION_IMPL *session, uint32_t leaf_cnt, WT_STUFF *ss)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *ref, **refp;
+ WT_TRACK *trk;
+ uint32_t i;
+
+ addr = NULL;
+
+ /* Allocate a row-store root (internal) page and fill it in. */
+ WT_RET(
+ __wt_page_alloc(session, WT_PAGE_ROW_INT, 0, leaf_cnt, 1, &page));
+ WT_ERR(__slvg_modify_init(session, page));
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ for (refp = pindex->index, i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+
+ ref = *refp++;
+ ref->home = page;
+ ref->page = NULL;
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ WT_ERR(__wt_strndup(
+ session, trk->trk_addr, trk->trk_addr_size, &addr->addr));
+ addr->size = trk->trk_addr_size;
+ addr->type =
+ trk->trk_ovfl_cnt == 0 ? WT_ADDR_LEAF_NO : WT_ADDR_LEAF;
+ ref->addr = addr;
+ addr = NULL;
+
+ __wt_ref_key_clear(ref);
+ ref->state = WT_REF_DISK;
+
+ /*
+ * If the page's key range is unmodified from when we read it
+ * (in other words, we didn't merge part of this page with
+ * another page), we can use the page without change, and the
+ * only thing we need to do is mark all overflow records the
+ * page references as in-use.
+ *
+ * If we did merge with another page, we have to build a page
+ * reflecting the updated key range. Note, that requires an
+ * additional pass to free the merge page's backing blocks.
+ */
+ if (F_ISSET(trk, WT_TRACK_MERGE)) {
+ ss->merge_free = 1;
+
+ WT_ERR(__slvg_row_build_leaf(session, trk, ref, ss));
+ } else {
+ WT_ERR(__wt_row_ikey_incr(session, page, 0,
+ trk->row_start.data, trk->row_start.size,
+ &ref->key.ikey));
+
+ WT_ERR(__slvg_ovfl_ref_all(session, trk));
+ }
+ ++ref;
+ }
+
+ __wt_root_ref_init(&ss->root_ref, page, 0);
+
+ if (0) {
+err: if (addr != NULL)
+ __wt_free(session, addr);
+ __wt_page_out(session, &page);
+ }
+ return (ret);
+}
+
+/*
+ * __slvg_row_build_leaf --
+ * Build a row-store leaf page for a merged page.
+ */
+static int
+__slvg_row_build_leaf(
+ WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref, WT_STUFF *ss)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SALVAGE_COOKIE *cookie, _cookie;
+ uint32_t i, skip_start, skip_stop;
+ int cmp;
+
+ btree = S2BT(session);
+ page = NULL;
+
+ cookie = &_cookie;
+ WT_CLEAR(*cookie);
+
+ /* Allocate temporary space in which to instantiate the keys. */
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Get the original page, including the full in-memory setup. */
+ WT_ERR(__wt_page_in(session, ref, 0));
+ page = ref->page;
+
+ /*
+ * Figure out how many page keys we want to take and how many we want
+ * to skip.
+ *
+ * If checking the starting range key, the key we're searching for will
+ * be equal to the starting range key. This is because we figured out
+ * the true merged-page start key as part of discarding initial keys
+ * from the page (see the __slvg_row_range_overlap function, and its
+ * calls to __slvg_row_trk_update_start for more information).
+ *
+ * If checking the stopping range key, we want the keys on the page that
+ * are less-than the stopping range key. This is because we copied a
+ * key from another page to define this page's stop range: that page is
+ * the page that owns the "equal to" range space.
+ */
+ skip_start = skip_stop = 0;
+ if (F_ISSET(trk, WT_TRACK_CHECK_START))
+ WT_ROW_FOREACH(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /*
+ * >= is correct: see the comment above.
+ */
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &trk->row_start, &cmp));
+ if (cmp >= 0)
+ break;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session,
+ ss->tmp1, key->data, key->size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding leading key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size,
+ ss->tmp2), (int)ss->tmp1->size,
+ (char *)ss->tmp1->data));
+ }
+ ++skip_start;
+ }
+ if (F_ISSET(trk, WT_TRACK_CHECK_STOP))
+ WT_ROW_FOREACH_REVERSE(page, rip, i) {
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /*
+ * < is correct: see the comment above.
+ */
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &trk->row_stop, &cmp));
+ if (cmp < 0)
+ break;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SALVAGE)) {
+ WT_ERR(__wt_buf_set_printable(session,
+ ss->tmp1, key->data, key->size));
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s merge discarding trailing key %.*s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size,
+ ss->tmp2), (int)ss->tmp1->size,
+ (char *)ss->tmp1->data));
+ }
+ ++skip_stop;
+ }
+
+ /* We should have selected some entries, but not the entire page. */
+ WT_ASSERT(session,
+ skip_start + skip_stop > 0 &&
+ skip_start + skip_stop < page->pg_row_entries);
+
+ /*
+ * Take a copy of this page's first key to define the start of
+	 * Take a copy of this page's first key to define the start of
+	 * its range. The key may require processing; otherwise, it's
+ */
+ rip = page->pg_row_d + skip_start;
+ WT_ERR(__wt_row_leaf_key(session, page, rip, key, 0));
+ WT_ERR(__wt_row_ikey_incr(session,
+ ref->home, 0, key->data, key->size, &ref->key.ikey));
+
+ /* Set the referenced flag on overflow pages we're using. */
+ if (trk->trk_ovfl_cnt != 0)
+ WT_ERR(__slvg_row_ovfl(session,
+ trk, page, skip_start, page->pg_row_entries - skip_stop));
+
+ /*
+ * Change the page to reflect the correct record count: there is no
+ * need to copy anything on the page itself, the entries value limits
+ * the number of page items.
+ */
+ page->pg_row_entries -= skip_stop;
+ cookie->skip = skip_start;
+
+ /*
+ * We can't discard the original blocks associated with this page now.
+ * (The problem is we don't want to overwrite any original information
+ * until the salvage run succeeds -- if we free the blocks now, the next
+ * merge page we write might allocate those blocks and overwrite them,
+ * and should the salvage run eventually fail, the original information
+ * would have been lost.) Clear the reference addr so eviction doesn't
+ * free the underlying blocks.
+ */
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ ref->addr = NULL;
+
+ /* Write the new version of the leaf page to disk. */
+ WT_ERR(__slvg_modify_init(session, page));
+ WT_ERR(__wt_rec_write(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+
+ /* Reset the page. */
+ page->pg_row_entries += skip_stop;
+
+ /*
+ * Discard our hazard pointer and evict the page, updating the
+ * parent's reference.
+ */
+ ret = __wt_page_release(session, ref, 0);
+ if (ret == 0)
+ ret = __wt_rec_evict(session, ref, 1);
+
+ if (0) {
+err: WT_TRET(__wt_page_release(session, ref, 0));
+ }
+ __wt_scr_free(&key);
+
+ return (ret);
+}
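The asymmetric comparisons above (>= for the start key, < for the stop key) encode ownership: the merged page owns the key equal to its start, while the key equal to its stop belongs to the neighboring page. A minimal sketch of the same trimming on a sorted string array, using strcmp in place of the btree collator (illustrative only):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Page keys in sorted order. */
	const char *keys[] = { "a", "c", "e", "g", "i" };
	size_t i, n = sizeof(keys) / sizeof(keys[0]);
	size_t skip_start, skip_stop;
	const char *start = "c", *stop = "g";	/* merged range [c, g) */

	/* >= is correct: the start key itself belongs to this page. */
	for (skip_start = 0; skip_start < n; ++skip_start)
		if (strcmp(keys[skip_start], start) >= 0)
			break;

	/* < is correct: the stop key belongs to the neighboring page. */
	for (skip_stop = 0; skip_stop < n; ++skip_stop)
		if (strcmp(keys[n - 1 - skip_stop], stop) < 0)
			break;

	for (i = skip_start; i < n - skip_stop; ++i)
		printf("%s\n", keys[i]);	/* prints c and e */
	return (0);
}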
+
+/*
+ * __slvg_row_ovfl_single --
+ * Find a single overflow record in the merge page's list, and mark it as
+ * referenced.
+ */
+static int
+__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL *cell)
+{
+ WT_CELL_UNPACK unpack;
+ WT_TRACK *ovfl;
+ uint32_t i;
+
+ /* Unpack the cell, and check if it's an overflow record. */
+ __wt_cell_unpack(cell, &unpack);
+ if (unpack.type != WT_CELL_KEY_OVFL &&
+ unpack.type != WT_CELL_VALUE_OVFL)
+ return (0);
+
+ /*
+ * Search the list of overflow records for this page -- we should find
+ * exactly one match, and we mark it as referenced.
+ */
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i) {
+ ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]];
+ if (unpack.size == ovfl->trk_addr_size &&
+ memcmp(unpack.data, ovfl->trk_addr, unpack.size) == 0)
+ return (__slvg_ovfl_ref(session, ovfl, 1));
+ }
+
+ WT_PANIC_RET(session,
+ EINVAL, "overflow record at row-store page merge not found");
+}
+
+/*
+ * __slvg_row_ovfl --
+ * Mark overflow items referenced by the merged page.
+ */
+static int
+__slvg_row_ovfl(WT_SESSION_IMPL *session,
+ WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop)
+{
+ WT_CELL *cell;
+ WT_ROW *rip;
+ void *copy;
+
+ /*
+ * We're merging a row-store page, and we took some number of records,
+ * figure out which (if any) overflow records we used.
+ */
+ for (rip = page->pg_row_d + start; start < stop; ++start, ++rip) {
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, NULL, &cell, NULL, NULL);
+ if (cell != NULL)
+ WT_RET(__slvg_row_ovfl_single(session, trk, cell));
+ cell = __wt_row_leaf_value_cell(page, rip, NULL);
+ if (cell != NULL)
+ WT_RET(__slvg_row_ovfl_single(session, trk, cell));
+ }
+ return (0);
+}
+
+/*
+ * __slvg_trk_compare_addr --
+ * Compare two WT_TRACK array entries by address cookie.
+ */
+static int
+__slvg_trk_compare_addr(const void *a, const void *b)
+{
+ WT_DECL_RET;
+ WT_TRACK *a_trk, *b_trk;
+ size_t len;
+
+ a_trk = *(WT_TRACK **)a;
+ b_trk = *(WT_TRACK **)b;
+
+ /*
+ * We don't care about the order because these are opaque cookies --
+ * we're just sorting them so we can binary search instead of linear
+ * search.
+ */
+ len = WT_MIN(a_trk->trk_addr_size, b_trk->trk_addr_size);
+ ret = memcmp(a_trk->trk_addr, b_trk->trk_addr, len);
+ if (ret == 0)
+ ret = a_trk->trk_addr_size > b_trk->trk_addr_size ? -1 : 1;
+ return (ret);
+}
+
+/*
+ * __slvg_ovfl_compare --
+ * Bsearch comparison routine for the overflow array.
+ */
+static int
+__slvg_ovfl_compare(const void *a, const void *b)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_TRACK *trk;
+ size_t len;
+
+ addr = (WT_ADDR *)a;
+ trk = *(WT_TRACK **)b;
+
+ len = WT_MIN(trk->trk_addr_size, addr->size);
+ ret = memcmp(addr->addr, trk->trk_addr, len);
+ if (ret == 0 && addr->size != trk->trk_addr_size)
+ ret = addr->size < trk->trk_addr_size ? -1 : 1;
+ return (ret);
+}
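Note the comparator asymmetry: the qsort comparator receives two array elements, while the bsearch comparator receives the search key as its first argument and an array element as its second. A hedged sketch of the same sort-then-binary-search shape on string keys (illustrative types, not the WiredTiger ones):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct item { const char *key; };

/* qsort comparator: both arguments are struct item elements. */
static int
cmp_items(const void *a, const void *b)
{
	return (strcmp(((const struct item *)a)->key,
	    ((const struct item *)b)->key));
}

/* bsearch comparator: the key is a string, the element a struct item. */
static int
cmp_key_item(const void *key, const void *elem)
{
	return (strcmp((const char *)key,
	    ((const struct item *)elem)->key));
}

int
main(void)
{
	struct item items[] = { { "cc" }, { "aa" }, { "bb" } };
	struct item *match;

	qsort(items, 3, sizeof(items[0]), cmp_items);
	match = bsearch("bb", items, 3, sizeof(items[0]), cmp_key_item);
	printf("%s\n", match == NULL ? "not found" : match->key);
	return (0);
}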
+
+/*
+ * __slvg_ovfl_reconcile --
+ * Review relationships between leaf pages and the overflow pages, delete
+ * leaf pages until there's a one-to-one relationship between leaf and overflow
+ * pages.
+ */
+static int
+__slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_TRACK **searchp, *trk;
+ uint32_t i, j, *slot;
+
+ slot = NULL;
+
+ /*
+ * If an overflow page is referenced more than once, discard leaf pages
+ * with the lowest LSNs until overflow pages are only referenced once.
+ *
+ * This requires sorting the page list by LSN, and the overflow array
+ * by address cookie.
+ */
+ qsort(ss->pages,
+ (size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_gen);
+ qsort(ss->ovfl,
+ (size_t)ss->ovfl_next, sizeof(WT_TRACK *), __slvg_trk_compare_addr);
+
+ /*
+ * Walk the list of pages and discard any pages referencing non-existent
+ * overflow pages or referencing overflow pages also referenced by pages
+	 * with higher LSNs. We just sorted the page list by LSN, high to
+	 * low, so we don't have to do explicit testing of the page LSNs: the
+	 * first page to reference an overflow page is the best page to own it.
+ */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL || trk->trk_ovfl_cnt == 0)
+ continue;
+
+ WT_ERR(__wt_calloc_def(session, trk->trk_ovfl_cnt, &slot));
+ for (j = 0; j < trk->trk_ovfl_cnt; ++j) {
+ addr = &trk->trk_ovfl_addr[j];
+ searchp = bsearch(addr, ss->ovfl, ss->ovfl_next,
+ sizeof(WT_TRACK *), __slvg_ovfl_compare);
+
+ /*
+ * If the overflow page doesn't exist or if another page
+ * has already claimed it, this leaf page isn't usable.
+ */
+ if (searchp != NULL &&
+ !F_ISSET(*searchp, WT_TRACK_OVFL_REFD)) {
+ /*
+ * Convert each block address into a slot in the
+ * list of overflow pages as we go.
+ */
+ slot[j] = (uint32_t)(searchp - ss->ovfl);
+ F_SET(*searchp, WT_TRACK_OVFL_REFD);
+ continue;
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s references unavailable overflow page %s",
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, ss->tmp1),
+ __wt_addr_string(session,
+ addr->addr, addr->size, ss->tmp2)));
+
+ /*
+ * Clear the "referenced" flag for any overflow pages
+			 * already claimed by this leaf page, so some other
+			 * page might claim them.
+ */
+ while (j > 0)
+ F_CLR(ss->ovfl[slot[--j]], WT_TRACK_OVFL_REFD);
+ trk = NULL;
+ WT_ERR(__slvg_trk_free(session, &ss->pages[i], 1));
+ break;
+ }
+
+ /*
+		 * We now have a reference to the overflow WT_TRACK and no
+		 * longer need the page's address array, so discard it. Note:
+		 * we potentially freed the WT_TRACK in the loop above, so
+		 * check that it's still valid.
+ */
+ if (trk == NULL)
+ __wt_free(session, slot);
+ else {
+ __slvg_trk_free_addr(session, trk);
+
+ trk->trk_ovfl_slot = slot;
+ slot = NULL;
+ }
+ }
+ return (0);
+
+err: __wt_free(session, slot);
+ return (ret);
+}
+
+/*
+ * __slvg_trk_compare_key --
+ * Compare two WT_TRACK array entries by key, and secondarily, by LSN.
+ */
+static int
+__slvg_trk_compare_key(const void *a, const void *b)
+{
+ WT_SESSION_IMPL *session;
+ WT_TRACK *a_trk, *b_trk;
+ uint64_t a_gen, a_recno, b_gen, b_recno;
+ int cmp;
+
+ a_trk = *(WT_TRACK **)a;
+ b_trk = *(WT_TRACK **)b;
+
+ if (a_trk == NULL)
+ return (b_trk == NULL ? 0 : 1);
+ if (b_trk == NULL)
+ return (-1);
+
+ switch (a_trk->ss->page_type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ a_recno = a_trk->col_start;
+ b_recno = b_trk->col_start;
+ if (a_recno == b_recno)
+ break;
+ if (a_recno > b_recno)
+ return (1);
+ if (a_recno < b_recno)
+ return (-1);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * XXX
+ * __wt_compare can potentially fail, and we're ignoring that
+ * error because this routine is called as an underlying qsort
+ * routine.
+ */
+ session = a_trk->ss->session;
+ (void)__wt_compare(session, S2BT(session)->collator,
+ &a_trk->row_start, &b_trk->row_start, &cmp);
+ if (cmp != 0)
+ return (cmp);
+ break;
+ }
+
+ /*
+ * If the primary keys compare equally, differentiate based on LSN.
+ * Sort from highest LSN to lowest, that is, the earlier pages in
+ * the array are more desirable.
+ */
+ a_gen = a_trk->trk_gen;
+ b_gen = b_trk->trk_gen;
+ return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0));
+}
+
+/*
+ * __slvg_trk_compare_gen --
+ * Compare two WT_TRACK array entries by LSN.
+ */
+static int
+__slvg_trk_compare_gen(const void *a, const void *b)
+{
+ WT_TRACK *a_trk, *b_trk;
+ uint64_t a_gen, b_gen;
+
+ a_trk = *(WT_TRACK **)a;
+ b_trk = *(WT_TRACK **)b;
+
+ /*
+ * Sort from highest LSN to lowest, that is, the earlier pages in the
+ * array are more desirable.
+ */
+ a_gen = a_trk->trk_gen;
+ b_gen = b_trk->trk_gen;
+ return (a_gen > b_gen ? -1 : (a_gen < b_gen ? 1 : 0));
+}
+
+/*
+ * __slvg_merge_block_free --
+ * Clean up backing file and overflow blocks after the merge phase.
+ */
+static int
+__slvg_merge_block_free(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint32_t i;
+
+ /* Free any underlying file blocks for merged pages. */
+ for (i = 0; i < ss->pages_next; ++i) {
+ if ((trk = ss->pages[i]) == NULL)
+ continue;
+ if (F_ISSET(trk, WT_TRACK_MERGE))
+ WT_RET(__slvg_trk_free(session, &ss->pages[i], 1));
+ }
+
+ /* Free any unused overflow records. */
+ return (__slvg_ovfl_discard(session, ss));
+}
+
+/*
+ * __slvg_ovfl_ref --
+ * Reference an overflow page, checking for multiple references.
+ */
+static int
+__slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, int multi_panic)
+{
+ if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
+ if (!multi_panic)
+ return (EBUSY);
+ WT_PANIC_RET(session, EINVAL,
+ "overflow record unexpectedly referenced multiple times "
+ "during leaf page merge");
+ }
+
+ F_SET(trk, WT_TRACK_OVFL_REFD);
+ return (0);
+}
+
+/*
+ * __slvg_ovfl_ref_all --
+ * Reference all of the page's overflow pages.
+ */
+static int
+__slvg_ovfl_ref_all(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+ uint32_t i;
+
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i)
+ WT_RET(__slvg_ovfl_ref(
+ session, trk->ss->ovfl[trk->trk_ovfl_slot[i]], 1));
+
+ return (0);
+}
+
+/*
+ * __slvg_ovfl_discard --
+ * Discard unused overflow pages.
+ */
+static int
+__slvg_ovfl_discard(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ WT_TRACK *trk;
+ uint32_t i;
+
+ /*
+ * Walk the overflow page array: if an overflow page isn't referenced,
+ * add its file blocks to the free list.
+ *
+ * Clear the reference flag (it's reused to figure out if the overflow
+ * record is referenced, but never used, by merged pages).
+ */
+ for (i = 0; i < ss->ovfl_next; ++i) {
+ if ((trk = ss->ovfl[i]) == NULL)
+ continue;
+
+ if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) {
+ F_CLR(trk, WT_TRACK_OVFL_REFD);
+ continue;
+ }
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s unused overflow page",
+ __wt_addr_string(
+ session, trk->trk_addr, trk->trk_addr_size, ss->tmp1)));
+ WT_RET(__slvg_trk_free(session, &ss->ovfl[i], 1));
+ }
+
+ return (0);
+}
+
+/*
+ * __slvg_cleanup --
+ * Discard memory allocated to the page and overflow arrays.
+ */
+static int
+__slvg_cleanup(WT_SESSION_IMPL *session, WT_STUFF *ss)
+{
+ uint32_t i;
+
+ /* Discard the leaf page array. */
+ for (i = 0; i < ss->pages_next; ++i)
+ if (ss->pages[i] != NULL)
+ WT_RET(__slvg_trk_free(session, &ss->pages[i], 0));
+ __wt_free(session, ss->pages);
+
+ /* Discard the ovfl page array. */
+ for (i = 0; i < ss->ovfl_next; ++i)
+ if (ss->ovfl[i] != NULL)
+ WT_RET(__slvg_trk_free(session, &ss->ovfl[i], 0));
+ __wt_free(session, ss->ovfl);
+
+ return (0);
+}
+
+/*
+ * __slvg_trk_free_addr --
+ * Discard address information.
+ */
+static void
+__slvg_trk_free_addr(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+ uint32_t i;
+
+ if (trk->trk_ovfl_addr != NULL) {
+ for (i = 0; i < trk->trk_ovfl_cnt; ++i)
+ __wt_free(session, trk->trk_ovfl_addr[i].addr);
+ __wt_free(session, trk->trk_ovfl_addr);
+ }
+}
+
+/*
+ * __slvg_trk_free_block --
+ * Discard underlying blocks.
+ */
+static int
+__slvg_trk_free_block(WT_SESSION_IMPL *session, WT_TRACK *trk)
+{
+ WT_BM *bm;
+
+ bm = S2BT(session)->bm;
+
+ /*
+ * If freeing underlying file blocks or overflow pages, this is a page
+ * we were tracking but eventually decided not to use.
+ */
+ WT_RET(__wt_verbose(session, WT_VERB_SALVAGE,
+ "%s blocks discarded: discard freed file bytes %" PRIu32,
+ __wt_addr_string(session,
+ trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), trk->trk_size));
+
+ return (bm->free(bm, session, trk->trk_addr, trk->trk_addr_size));
+}
+
+/*
+ * __slvg_trk_free --
+ * Discard a WT_TRACK structure and (optionally) its underlying blocks.
+ */
+static int
+__slvg_trk_free(WT_SESSION_IMPL *session, WT_TRACK **trkp, int free_on_last_ref)
+{
+ WT_TRACK *trk;
+
+ trk = *trkp;
+ *trkp = NULL;
+
+ /*
+ * If we're the last user of shared information, clean up.
+ */
+ WT_ASSERT(session, trk->shared->ref > 0);
+ if (--trk->shared->ref == 0) {
+ /*
+ * If the free-on-last-ref flag is set, this chunk isn't going
+ * to use the backing physical blocks. As we're the last user
+ * of those blocks, nobody is going to use them and they can be
+ * discarded.
+ */
+ if (free_on_last_ref)
+ WT_RET(__slvg_trk_free_block(session, trk));
+
+ __wt_free(session, trk->trk_addr);
+
+ __slvg_trk_free_addr(session, trk);
+
+ __wt_free(session, trk->trk_ovfl_slot);
+
+ __wt_free(session, trk->shared);
+ }
+
+ if (trk->ss->page_type == WT_PAGE_ROW_LEAF) {
+ __wt_buf_free(session, &trk->row_start);
+ __wt_buf_free(session, &trk->row_stop);
+ }
+
+ __wt_free(session, trk);
+
+ return (0);
+}
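WT_TRACK teardown above is a plain reference count over the shared information: every caller frees the per-chunk state, and only the last reference tears down (and optionally discards blocks for) the shared state. A hedged miniature of the same ownership scheme, with all names invented:

#include <assert.h>
#include <stdlib.h>

struct shared { unsigned ref; /* jointly owned state */ };
struct chunk { struct shared *shared; /* plus per-chunk state */ };

/* Free a chunk; the last reference also frees the shared state. */
static void
chunk_free(struct chunk **chunkp)
{
	struct chunk *chunk;

	chunk = *chunkp;
	*chunkp = NULL;

	assert(chunk->shared->ref > 0);
	if (--chunk->shared->ref == 0)
		free(chunk->shared);

	free(chunk);	/* per-chunk state is always freed */
}

int
main(void)
{
	struct shared *s;
	struct chunk *a, *b;

	/* Two chunks share one piece of state (allocation checks omitted). */
	s = calloc(1, sizeof(*s));
	a = calloc(1, sizeof(*a));
	b = calloc(1, sizeof(*b));
	s->ref = 2;
	a->shared = b->shared = s;

	chunk_free(&a);		/* shared state survives */
	chunk_free(&b);		/* last reference frees it */
	return (0);
}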
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
new file mode 100644
index 00000000000..3da0bcf346c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page_row_leaf(WT_PAGE *, WT_DSRC_STATS *);
+
+/*
+ * __wt_btree_stat_init --
+ * Initialize the Btree statistics.
+ */
+int
+__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_DSRC_STATS *stats;
+ WT_REF *next_walk;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ stats = &btree->dhandle->stats;
+
+ WT_RET(bm->stat(bm, session, stats));
+
+ WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
+ WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
+ WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
+ WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
+ WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
+ WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);
+
+ /* Everything else is really, really expensive. */
+ if (!F_ISSET(cst, WT_CONN_STAT_ALL))
+ return (0);
+
+ next_walk = NULL;
+ while ((ret =
+ __wt_tree_walk(session, &next_walk, 0)) == 0 && next_walk != NULL)
+ WT_RET(__stat_page(session, next_walk->page, stats));
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
+
+/*
+ * __stat_page --
+ * Stat any Btree page.
+ */
+static int
+__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * All internal pages and overflow pages are trivial, all we track is
+ * a count of the page type.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ WT_STAT_INCR(stats, btree_column_fix);
+ WT_STAT_INCRV(stats, btree_entries, page->pg_fix_entries);
+ break;
+ case WT_PAGE_COL_INT:
+ WT_STAT_INCR(stats, btree_column_internal);
+ pindex = WT_INTL_INDEX_COPY(page);
+ WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+ break;
+ case WT_PAGE_COL_VAR:
+ WT_RET(__stat_page_col_var(page, stats));
+ break;
+ case WT_PAGE_OVFL:
+ WT_STAT_INCR(stats, btree_overflow);
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_STAT_INCR(stats, btree_row_internal);
+ pindex = WT_INTL_INDEX_COPY(page);
+ WT_STAT_INCRV(stats, btree_entries, pindex->entries);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__stat_page_row_leaf(page, stats));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __stat_page_col_var --
+ * Stat a WT_PAGE_COL_VAR page.
+ */
+static int
+__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COL *cip;
+ WT_INSERT *ins;
+ WT_UPDATE *upd;
+ uint32_t i;
+ int orig_deleted;
+
+ unpack = &_unpack;
+
+ WT_STAT_INCR(stats, btree_column_variable);
+
+ /*
+ * Walk the page, counting regular and overflow data items, and checking
+ * to be sure any updates weren't deletions. If the item was updated,
+ * assume it was updated by an item of the same size (it's expensive to
+ * figure out if it will require the same space or not, especially if
+ * there's Huffman encoding).
+ */
+ WT_COL_FOREACH(page, cip, i) {
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ orig_deleted = 1;
+ WT_STAT_INCR(stats, btree_column_deleted);
+ } else {
+ orig_deleted = 0;
+ __wt_cell_unpack(cell, unpack);
+ WT_STAT_INCRV(
+ stats, btree_entries, __wt_cell_rle(unpack));
+ }
+
+ /*
+ * Walk the insert list, checking for changes. For each insert
+ * we find, correct the original count based on its state.
+ */
+ WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
+ upd = ins->upd;
+ if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (orig_deleted)
+ continue;
+ WT_STAT_INCR(stats, btree_column_deleted);
+ WT_STAT_DECR(stats, btree_entries);
+ } else {
+ if (!orig_deleted)
+ continue;
+ WT_STAT_DECR(stats, btree_column_deleted);
+ WT_STAT_INCR(stats, btree_entries);
+ }
+ }
+ }
+ return (0);
+}
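
The insert-list correction above follows one rule: the on-disk cell fixes an original deleted/not-deleted state, and an update only moves the counters when it flips that state. A compact model of the rule, with hypothetical types:

    #include <stdint.h>

    struct counts {
        int64_t entries;    /* live key/value pairs */
        int64_t deleted;    /* deleted entries */
    };

    /*
     * Apply one update's effect on the counters, relative to the
     * entry's original on-disk state; an update that doesn't change
     * the deleted/not-deleted state leaves the counters alone.
     */
    static void
    apply_update(struct counts *c, int orig_deleted, int upd_deleted)
    {
        if (upd_deleted && !orig_deleted) {
            ++c->deleted;
            --c->entries;
        } else if (!upd_deleted && orig_deleted) {
            --c->deleted;
            ++c->entries;
        }
    }
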
+
+/*
+ * __stat_page_row_leaf --
+ * Stat a WT_PAGE_ROW_LEAF page.
+ */
+static int
+__stat_page_row_leaf(WT_PAGE *page, WT_DSRC_STATS *stats)
+{
+ WT_INSERT *ins;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ uint32_t cnt, i;
+
+ WT_STAT_INCR(stats, btree_row_leaf);
+
+ /*
+ * Stat any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ cnt = 0;
+ WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page))
+ if (!WT_UPDATE_DELETED_ISSET(ins->upd))
+ ++cnt;
+
+ /* Stat the page's K/V pairs. */
+ WT_ROW_FOREACH(page, rip, i) {
+ upd = WT_ROW_UPDATE(page, rip);
+ if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
+ ++cnt;
+
+ /* Stat inserted K/V pairs. */
+ WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
+ if (!WT_UPDATE_DELETED_ISSET(ins->upd))
+ ++cnt;
+ }
+
+ WT_STAT_INCRV(stats, btree_entries, cnt);
+
+ return (0);
+}
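
Applications normally consume these numbers through a statistics cursor rather than by calling __wt_btree_stat_init directly; the tree walk guarded by WT_CONN_STAT_ALL above only runs when "all" statistics are requested. A sketch of that public-API path, assuming an already-open WT_CONNECTION (the file URI is hypothetical):

    #include <stdio.h>
    #include <wiredtiger.h>

    static int
    dump_stats(WT_CONNECTION *conn)
    {
        WT_CURSOR *cursor;
        WT_SESSION *session;
        const char *desc, *pvalue;
        uint64_t value;
        int ret, tret;

        if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
            return (ret);

        /* "statistics=(all)" triggers the expensive full tree walk. */
        if ((ret = session->open_cursor(session,
            "statistics:file:example.wt", NULL,
            "statistics=(all)", &cursor)) != 0)
            return (ret);

        while ((ret = cursor->next(cursor)) == 0 &&
            (ret = cursor->get_value(
            cursor, &desc, &pvalue, &value)) == 0)
            printf("%s: %s\n", desc, pvalue);
        ret = ret == WT_NOTFOUND ? 0 : ret;

        if ((tret = session->close(session, NULL)) != 0 && ret == 0)
            ret = tret;
        return (ret);
    }
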
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
new file mode 100644
index 00000000000..607e7919513
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -0,0 +1,373 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __sync_file --
+ * Flush pages for a specific file.
+ */
+static int
+__sync_file(WT_SESSION_IMPL *session, int syncop)
+{
+ struct timespec end, start;
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *walk;
+ WT_TXN *txn;
+ uint64_t internal_bytes, leaf_bytes;
+ uint64_t internal_pages, leaf_pages;
+ uint32_t flags;
+
+ btree = S2BT(session);
+
+ flags = WT_READ_CACHE | WT_READ_NO_GEN;
+ walk = NULL;
+ txn = &session->txn;
+
+ internal_bytes = leaf_bytes = 0;
+ internal_pages = leaf_pages = 0;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ WT_RET(__wt_epoch(session, &start));
+
+ switch (syncop) {
+ case WT_SYNC_WRITE_LEAVES:
+ /*
+ * Write all immediately available, dirty in-cache leaf pages.
+ *
+ * Writing the leaf pages is done without acquiring a high-level
+ * lock; serialize so that multiple threads don't walk the tree
+ * at the same time.
+ */
+ if (!btree->modified)
+ return (0);
+ __wt_spin_lock(session, &btree->flush_lock);
+ if (!btree->modified) {
+ __wt_spin_unlock(session, &btree->flush_lock);
+ return (0);
+ }
+
+ flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
+ for (walk = NULL;;) {
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
+ if (walk == NULL)
+ break;
+
+ /* Write dirty pages if nobody beat us to it. */
+ page = walk->page;
+ if (__wt_page_is_modified(page)) {
+ if (txn->isolation == TXN_ISO_READ_COMMITTED)
+ __wt_txn_refresh(session, 1);
+ leaf_bytes += page->memory_footprint;
+ ++leaf_pages;
+ WT_ERR(__wt_rec_write(session, walk, NULL, 0));
+ }
+ }
+ break;
+ case WT_SYNC_CHECKPOINT:
+ /*
+ * We cannot check the tree modified flag in the case of a
+ * checkpoint, the checkpoint code has already cleared it.
+ *
+ * Writing the leaf pages is done without acquiring a high-level
+ * lock; serialize so that multiple threads don't walk the tree
+ * at the same time. We're holding the schema lock, but need the
+ * lower-level lock as well.
+ */
+ __wt_spin_lock(session, &btree->flush_lock);
+
+ /*
+ * When internal pages are being reconciled by checkpoint their
+ * child pages cannot disappear from underneath them or be split
+ * into them, nor can underlying blocks be freed until the block
+ * lists for the checkpoint are stable. Set the checkpointing
+ * flag to block eviction of dirty pages until the checkpoint's
+ * internal page pass is complete, then wait for any existing
+ * eviction to complete.
+ */
+ btree->checkpointing = 1;
+
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
+ WT_ERR(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+ }
+
+ /* Write all dirty in-cache pages. */
+ flags |= WT_READ_NO_EVICT;
+ for (walk = NULL;;) {
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
+ if (walk == NULL)
+ break;
+
+ /*
+ * Write dirty pages, unless we can be sure they only
+ * became dirty after the checkpoint started.
+ *
+ * We can skip dirty pages if:
+ * (1) they are leaf pages;
+ * (2) there is a snapshot transaction active (which
+ * is the case in ordinary application checkpoints
+ * but not all internal cases); and
+ * (3) the first dirty update on the page is
+ * sufficiently recent that the checkpoint
+ * transaction would skip them.
+ */
+ page = walk->page;
+ mod = page->modify;
+ if (__wt_page_is_modified(page) &&
+ (WT_PAGE_IS_INTERNAL(page) ||
+ !F_ISSET(txn, TXN_HAS_SNAPSHOT) ||
+ TXNID_LE(mod->first_dirty_txn, txn->snap_max))) {
+ if (WT_PAGE_IS_INTERNAL(page)) {
+ internal_bytes +=
+ page->memory_footprint;
+ ++internal_pages;
+ } else {
+ leaf_bytes += page->memory_footprint;
+ ++leaf_pages;
+ }
+ WT_ERR(__wt_rec_write(session, walk, NULL, 0));
+ }
+ }
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
+ WT_ERR(__wt_epoch(session, &end));
+ WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
+ "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
+ " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
+ " bytes, %" PRIu64 " pages of internal\n\t"
+ "Took: %" PRIu64 "ms",
+ syncop == WT_SYNC_WRITE_LEAVES ?
+ "WRITE_LEAVES" : "CHECKPOINT",
+ leaf_bytes, leaf_pages, internal_bytes, internal_pages,
+ WT_TIMEDIFF(end, start) / WT_MILLION));
+ }
+
+err: /* On error, clear any left-over tree walk. */
+ if (walk != NULL)
+ WT_TRET(__wt_page_release(session, walk, flags));
+
+ if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0)
+ __wt_txn_release_snapshot(session);
+
+ if (btree->checkpointing) {
+ /*
+ * Clear the checkpoint flag and push the change; not required,
+ * but publishing the change means stalled eviction gets moving
+ * as soon as possible.
+ */
+ btree->checkpointing = 0;
+ WT_FULL_BARRIER();
+
+ /*
+ * Wake the eviction server, in case application threads have
+ * stalled while the eviction server decided it couldn't make
+ * progress. Without this, application threads will be stalled
+ * until the eviction server next wakes.
+ */
+ WT_TRET(__wt_evict_server_wake(session));
+ }
+
+ __wt_spin_unlock(session, &btree->flush_lock);
+
+ /*
+ * Leaves are written before a checkpoint (or as part of a file close,
+ * before checkpointing the file). Start a flush to stable storage,
+ * but don't wait for it.
+ */
+ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
+ WT_RET(btree->bm->sync(btree->bm, session, 1));
+
+ return (ret);
+}
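
The three-part skip test in the checkpoint case above reduces to a single predicate: a dirty page must be written unless it is a leaf, the transaction has a snapshot, and the page first became dirty after the snapshot's maximum ID. A standalone model of that predicate (hypothetical types; TXNID_LE is modeled as a plain comparison):

    #include <stdbool.h>
    #include <stdint.h>

    struct mpage { bool is_internal; uint64_t first_dirty_txn; };
    struct mtxn  { bool has_snapshot; uint64_t snap_max; };

    /* Return true if a dirty page must be written by the checkpoint. */
    static bool
    must_write(const struct mpage *p, const struct mtxn *t)
    {
        return (p->is_internal ||
            !t->has_snapshot ||
            p->first_dirty_txn <= t->snap_max);
    }
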
+
+/*
+ * __evict_file --
+ * Discard pages for a specific file.
+ */
+static int
+__evict_file(WT_SESSION_IMPL *session, int syncop)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *next_ref, *ref;
+ int eviction_enabled;
+
+ btree = S2BT(session);
+ eviction_enabled = !F_ISSET(btree, WT_BTREE_NO_EVICTION);
+
+ /*
+ * We need exclusive access to the file -- disable ordinary eviction
+ * and drain any blocks already queued.
+ */
+ if (eviction_enabled)
+ WT_RET(__wt_evict_file_exclusive_on(session));
+
+ /* Make sure the oldest transaction ID is up-to-date. */
+ __wt_txn_update_oldest(session);
+
+ /* Walk the tree, discarding pages. */
+ next_ref = NULL;
+ WT_ERR(__wt_tree_walk(
+ session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+ while ((ref = next_ref) != NULL) {
+ page = ref->page;
+
+ /*
+ * Eviction can fail when a page in the evicted page's subtree
+ * switches state. For example, if we don't evict a page marked
+ * empty, because we expect it to be merged into its parent, it
+ * might no longer be empty after it's reconciled, in which case
+ * eviction of its parent would fail. We can either walk the
+ * tree multiple times (until it's finally empty), or reconcile
+ * each page to get it to its final state before considering if
+ * it's an eviction target or will be merged into its parent.
+ *
+ * Don't limit this test to any particular page type: limiting it
+ * tends to introduce bugs when the reconciliation of other page
+ * types changes, and there's no advantage to doing so.
+ *
+ * Eviction can also fail because an update cannot be written.
+ * If sessions have disjoint sets of files open, updates in a
+ * no-longer-referenced file may not yet be globally visible,
+ * and the write will fail with EBUSY. Our caller handles that
+ * error, retrying later.
+ */
+ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
+ WT_ERR(__wt_rec_write(session, ref, NULL, WT_EVICTING));
+
+ /*
+ * We can't evict the page just returned to us (it marks our
+ * place in the tree), so move the walk to one page ahead of
+ * the page being evicted. Note, we reconciled the returned
+ * page first: if reconciliation of that page were to change
+ * the shape of the tree, and we did the next walk call before
+ * the reconciliation, the next walk call could miss a page in
+ * the tree.
+ */
+ WT_ERR(__wt_tree_walk(
+ session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
+
+ switch (syncop) {
+ case WT_SYNC_CLOSE:
+ /*
+ * Evict the page.
+ * Do not attempt to evict pages expected to be merged
+ * into their parents, with the exception that the root
+ * page can't be merged, it must be written.
+ */
+ if (__wt_ref_is_root(ref) ||
+ page->modify == NULL ||
+ !F_ISSET(page->modify, WT_PM_REC_EMPTY))
+ WT_ERR(__wt_rec_evict(session, ref, 1));
+ break;
+ case WT_SYNC_DISCARD:
+ case WT_SYNC_DISCARD_FORCE:
+ /*
+ * Discard the page, whether clean or dirty.
+ *
+ * Clean the page, both to keep statistics correct, and
+ * to let the page-discard function assert no dirty page
+ * is ever discarded.
+ */
+ if (__wt_page_is_modified(page)) {
+ page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ }
+ /*
+ * If the page contains an update that is too recent to
+ * evict, stop. This should never happen during
+ * connection close, and in other paths our caller
+ * should be prepared to deal with this case.
+ */
+ if (syncop == WT_SYNC_DISCARD &&
+ page->modify != NULL &&
+ !__wt_txn_visible_all(session,
+ page->modify->rec_max_txn))
+ return (EBUSY);
+ if (syncop == WT_SYNC_DISCARD_FORCE)
+ F_SET(session, WT_SESSION_DISCARD_FORCE);
+ __wt_ref_out(session, ref);
+ /*
+ * In case we don't discard the whole tree, make sure
+ * that future readers know that the page is no longer
+ * in cache.
+ */
+ ref->state = WT_REF_DISK;
+ F_CLR(session, WT_SESSION_DISCARD_FORCE);
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+ if (0) {
+err: /* On error, clear any left-over tree walk. */
+ if (next_ref != NULL)
+ WT_TRET(__wt_page_release(
+ session, next_ref, WT_READ_NO_EVICT));
+ }
+
+ if (eviction_enabled)
+ __wt_evict_file_exclusive_off(session);
+
+ return (ret);
+}
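
The walk in __evict_file advances to the next page before discarding the current one, the standard way to keep a traversal valid while destroying what it visits. The same shape in miniature, with a linked list standing in for the Btree walk (hypothetical types):

    #include <stdlib.h>

    struct node { struct node *next; };

    static void
    discard_all(struct node *head)
    {
        struct node *cur, *next;

        for (cur = head; cur != NULL; cur = next) {
            next = cur->next;   /* step past the node first */
            free(cur);          /* now safe to discard it */
        }
    }
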
+
+/*
+ * __wt_cache_op --
+ * Cache operations.
+ */
+int
+__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+
+ switch (op) {
+ case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_CLOSE:
+ /*
+ * Set the checkpoint reference for reconciliation; it's ugly,
+ * but drilling a function parameter path from our callers to
+ * the reconciliation of the tree's root page is going to be
+ * worse.
+ */
+ WT_ASSERT(session, btree->ckpt == NULL);
+ btree->ckpt = ckptbase;
+ break;
+ }
+
+ switch (op) {
+ case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_WRITE_LEAVES:
+ WT_ERR(__sync_file(session, op));
+ break;
+ case WT_SYNC_CLOSE:
+ case WT_SYNC_DISCARD:
+ case WT_SYNC_DISCARD_FORCE:
+ WT_ERR(__evict_file(session, op));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+err: switch (op) {
+ case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_CLOSE:
+ btree->ckpt = NULL;
+ break;
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_upgrade.c b/src/third_party/wiredtiger/src/btree/bt_upgrade.c
new file mode 100644
index 00000000000..d65c8793fbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_upgrade.c
@@ -0,0 +1,22 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_upgrade --
+ * Upgrade a file.
+ */
+int
+__wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_UNUSED(cfg);
+
+ /* There's nothing to upgrade, yet. */
+ WT_RET(__wt_progress(session, NULL, 1));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
new file mode 100644
index 00000000000..e7caf02fd2f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -0,0 +1,666 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's a bunch of stuff we pass around during verification; group it
+ * together to make the code prettier.
+ */
+typedef struct {
+ uint64_t record_total; /* Total record count */
+
+ WT_ITEM *max_key; /* Largest key */
+ WT_ITEM *max_addr; /* Largest key page */
+
+ uint64_t fcnt; /* Progress counter */
+
+ int dump_address; /* Debugging hooks */
+ int dump_pages;
+ int dump_blocks;
+
+ WT_ITEM *tmp1; /* Temporary buffer */
+ WT_ITEM *tmp2; /* Temporary buffer */
+} WT_VSTUFF;
+
+static void __verify_checkpoint_reset(WT_VSTUFF *);
+static int __verify_config(WT_SESSION_IMPL *, const char *[], WT_VSTUFF *);
+static int __verify_config_offsets(WT_SESSION_IMPL *, const char *[], int *);
+static int __verify_overflow(
+ WT_SESSION_IMPL *, const uint8_t *, size_t, WT_VSTUFF *);
+static int __verify_overflow_cell(
+ WT_SESSION_IMPL *, WT_REF *, int *, WT_VSTUFF *);
+static int __verify_row_int_key_order(
+ WT_SESSION_IMPL *, WT_PAGE *, WT_REF *, uint32_t, WT_VSTUFF *);
+static int __verify_row_leaf_key_order(
+ WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
+static int __verify_tree(WT_SESSION_IMPL *, WT_REF *, WT_VSTUFF *);
+
+/*
+ * __wt_verify --
+ * Verify a file.
+ */
+int
+__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT *ckptbase, *ckpt;
+ WT_DECL_RET;
+ WT_VSTUFF *vs, _vstuff;
+ size_t root_addr_size;
+ uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
+ int bm_start, quit;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ ckptbase = NULL;
+ bm_start = 0;
+
+ WT_CLEAR(_vstuff);
+ vs = &_vstuff;
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
+ WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
+
+ /* Check configuration strings. */
+ WT_ERR(__verify_config(session, cfg, vs));
+
+ /* Optionally dump specific block offsets. */
+ WT_ERR(__verify_config_offsets(session, cfg, &quit));
+ if (quit)
+ goto done;
+
+ /* Get a list of the checkpoints for this file. */
+ WT_ERR(
+ __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));
+
+ /* Inform the underlying block manager we're verifying. */
+ WT_ERR(bm->verify_start(bm, session, ckptbase));
+ bm_start = 1;
+
+ /* Loop through the file's checkpoints, verifying each one. */
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
+ "%s: checkpoint %s", btree->dhandle->name, ckpt->name));
+
+ /* Fake checkpoints require no work. */
+ if (F_ISSET(ckpt, WT_CKPT_FAKE))
+ continue;
+
+ /* House-keeping between checkpoints. */
+ __verify_checkpoint_reset(vs);
+
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
+ WT_ERR(__wt_msg(session, "%s: checkpoint %s",
+ btree->dhandle->name, ckpt->name));
+#endif
+ /* Load the checkpoint. */
+ WT_ERR(bm->checkpoint_load(bm, session,
+ ckpt->raw.data, ckpt->raw.size,
+ root_addr, &root_addr_size, 1));
+
+ /*
+ * Ignore trees with no root page.
+ * Verify, then discard the checkpoint from the cache.
+ */
+ if (root_addr_size != 0 &&
+ (ret = __wt_btree_tree_open(
+ session, root_addr, root_addr_size)) == 0) {
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address ||
+ vs->dump_blocks || vs->dump_pages)
+ WT_ERR(__wt_msg(session, "Root: %s %s",
+ __wt_addr_string(session,
+ root_addr, root_addr_size, vs->tmp1),
+ __wt_page_type_string(
+ btree->root.page->type)));
+#endif
+ ret = __verify_tree(session, &btree->root, vs);
+
+ WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+ }
+
+ /* Unload the checkpoint. */
+ WT_TRET(bm->checkpoint_unload(bm, session));
+ WT_ERR(ret);
+ }
+
+done:
+err: /* Inform the underlying block manager we're done. */
+ if (bm_start)
+ WT_TRET(bm->verify_end(bm, session));
+
+ /* Discard the list of checkpoints. */
+ if (ckptbase != NULL)
+ __wt_meta_ckptlist_free(session, ckptbase);
+
+ /* Wrap up reporting. */
+ WT_TRET(__wt_progress(session, NULL, vs->fcnt));
+
+ /* Free allocated memory. */
+ __wt_scr_free(&vs->max_key);
+ __wt_scr_free(&vs->max_addr);
+ __wt_scr_free(&vs->tmp1);
+ __wt_scr_free(&vs->tmp2);
+
+ return (ret);
+}
+
+/*
+ * __verify_config --
+ * Debugging: verification supports dumping pages in various formats.
+ */
+static int
+__verify_config(WT_SESSION_IMPL *session, const char *cfg[], WT_VSTUFF *vs)
+{
+ WT_CONFIG_ITEM cval;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_address", &cval));
+ vs->dump_address = cval.val != 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_blocks", &cval));
+ vs->dump_blocks = cval.val != 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_pages", &cval));
+ vs->dump_pages = cval.val != 0;
+
+#if !defined(HAVE_DIAGNOSTIC)
+ if (vs->dump_address || vs->dump_blocks || vs->dump_pages)
+ WT_RET_MSG(session, ENOTSUP,
+ "the WiredTiger library was not built in diagnostic mode");
+#endif
+ return (0);
+}
+
+/*
+ * __verify_config_offsets --
+ * Debugging: optionally dump specific blocks from the file.
+ */
+static int
+__verify_config_offsets(WT_SESSION_IMPL *session, const char *cfg[], int *quitp)
+{
+ WT_CONFIG list;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_RET;
+ u_long offset;
+
+ *quitp = 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "dump_offsets", &cval));
+ WT_RET(__wt_config_subinit(session, &list, &cval));
+ while ((ret = __wt_config_next(&list, &k, &v)) == 0) {
+ /*
+ * Quit after dumping the requested blocks. (That's hopefully
+ * what the user wanted, all of this stuff is just hooked into
+ * verify because that's where we "dump blocks" for debugging.)
+ */
+ *quitp = 1;
+ if (v.len != 0 || sscanf(k.str, "%lu", &offset) != 1)
+ WT_RET_MSG(session, EINVAL,
+ "unexpected dump offset format");
+#if !defined(HAVE_DIAGNOSTIC)
+ WT_RET_MSG(session, ENOTSUP,
+ "the WiredTiger library was not built in diagnostic mode");
+#else
+ WT_TRET(
+ __wt_debug_offset_blind(session, (wt_off_t)offset, NULL));
+#endif
+ }
+ return (ret == WT_NOTFOUND ? 0 : ret);
+}
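
From the application side, all of this is reached through WT_SESSION::verify, and the dump_offsets list parsed above arrives as configuration. A sketch of a caller, assuming an open WT_CONNECTION and a diagnostic build for the dump options (the file name is hypothetical):

    #include <wiredtiger.h>

    static int
    verify_example(WT_CONNECTION *conn)
    {
        WT_SESSION *session;
        int ret, tret;

        if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
            return (ret);

        /* Ordinary verification of every checkpoint in the file. */
        ret = session->verify(session, "file:example.wt", NULL);

        /* Diagnostic builds only: dump two specific block offsets. */
        if (ret == 0)
            ret = session->verify(session,
                "file:example.wt", "dump_offsets=[8192,16384]");

        if ((tret = session->close(session, NULL)) != 0 && ret == 0)
            ret = tret;
        return (ret);
    }
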
+
+/*
+ * __verify_checkpoint_reset --
+ * Reset anything needing to be reset for each new checkpoint verification.
+ */
+static void
+__verify_checkpoint_reset(WT_VSTUFF *vs)
+{
+ /*
+ * Key order is per checkpoint, reset the data length that serves as a
+ * flag value.
+ */
+ vs->max_addr->size = 0;
+
+ /* Record total is per checkpoint, reset the record count. */
+ vs->record_total = 0;
+}
+
+/*
+ * __verify_tree --
+ * Verify a tree, recursively descending through it in depth-first fashion.
+ * The page argument was physically verified (so we know it's correctly formed),
+ * and the in-memory version built. Our job is to check logical relationships
+ * in the page and in the tree.
+ */
+static int
+__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
+{
+ WT_BM *bm;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_REF *child_ref;
+ uint64_t recno;
+ uint32_t entry, i;
+ int found;
+
+ bm = S2BT(session)->bm;
+ page = ref->page;
+
+ unpack = &_unpack;
+ WT_CLEAR(*unpack); /* -Wuninitialized */
+
+ WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type)));
+#ifdef HAVE_DIAGNOSTIC
+ if (vs->dump_address)
+ WT_RET(__wt_msg(session, "%s %s",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type)));
+#endif
+
+ /*
+ * The page's physical structure was verified when it was read into
+ * memory by the read server thread, and then the in-memory version
+ * of the page was built. Now we make sure the page and tree are
+ * logically consistent.
+ *
+ * !!!
+ * The problem: (1) the read server has to build the in-memory version
+ * of the page because the read server is the thread that flags when
+ * any thread can access the page in the tree; (2) we can't build the
+ * in-memory version of the page until the physical structure is known
+ * to be OK, so the read server has to verify at least the physical
+ * structure of the page; (3) doing complete page verification requires
+ * reading additional pages (for example, overflow keys imply reading
+ * overflow pages in order to test the key's order in the page); (4)
+ * the read server cannot read additional pages because it will hang
+ * waiting on itself. For this reason, we split page verification
+ * into a physical verification, which allows the in-memory version
+ * of the page to be built, and then a subsequent logical verification
+ * which happens here.
+ *
+ * Report progress every 10 pages.
+ */
+ if (++vs->fcnt % 10 == 0)
+ WT_RET(__wt_progress(session, NULL, vs->fcnt));
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Optionally dump the blocks or page in debugging mode. */
+ if (vs->dump_blocks)
+ WT_RET(__wt_debug_disk(session, page->dsk, NULL));
+ if (vs->dump_pages)
+ WT_RET(__wt_debug_page(session, page, NULL));
+#endif
+
+ /*
+ * Column-store key order checks: check the page's record number and
+ * then update the total record count.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ recno = page->pg_fix_recno;
+ goto recno_chk;
+ case WT_PAGE_COL_INT:
+ recno = page->pg_intl_recno;
+ goto recno_chk;
+ case WT_PAGE_COL_VAR:
+ recno = page->pg_var_recno;
+recno_chk: if (recno != vs->record_total + 1)
+ WT_RET_MSG(session, WT_ERROR,
+ "page at %s has a starting record of %" PRIu64
+ " when the expected starting record is %" PRIu64,
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ recno, vs->record_total + 1);
+ break;
+ }
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ vs->record_total += page->pg_fix_entries;
+ break;
+ case WT_PAGE_COL_VAR:
+ recno = 0;
+ WT_COL_FOREACH(page, cip, i)
+ if ((cell = WT_COL_PTR(page, cip)) == NULL)
+ ++recno;
+ else {
+ __wt_cell_unpack(cell, unpack);
+ recno += __wt_cell_rle(unpack);
+ }
+ vs->record_total += recno;
+ break;
+ }
+
+ /*
+ * Row-store leaf page key order check: it's a depth-first traversal,
+ * the first key on this page should be larger than any key previously
+ * seen.
+ */
+ switch (page->type) {
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__verify_row_leaf_key_order(session, ref, vs));
+ break;
+ }
+
+ /* If it's not the root page, unpack the parent cell. */
+ if (!__wt_ref_is_root(ref)) {
+ __wt_cell_unpack(ref->addr, unpack);
+
+ /* Compare the parent cell against the page type. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
+ goto celltype_err;
+ break;
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ if (unpack->raw != WT_CELL_ADDR_LEAF &&
+ unpack->raw != WT_CELL_ADDR_LEAF_NO)
+ goto celltype_err;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ if (unpack->raw != WT_CELL_ADDR_INT)
+celltype_err: WT_RET_MSG(session, WT_ERROR,
+ "page at %s, of type %s, is referenced in "
+ "its parent by a cell of type %s",
+ __wt_page_addr_string(
+ session, ref, vs->tmp1),
+ __wt_page_type_string(page->type),
+ __wt_cell_type_string(unpack->raw));
+ break;
+ }
+ }
+
+ /*
+ * Check overflow pages. We check overflow cells separately from other
+ * tests that walk the page as it's simpler, and I don't care much how
+ * fast table verify runs.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__verify_overflow_cell(session, ref, &found, vs));
+ if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
+ break;
+
+ /*
+ * Object if a leaf-no-overflow address cell references a page
+ * with overflow keys, but don't object if a leaf address cell
+ * references a page without overflow keys. Reconciliation
+ * doesn't guarantee every leaf page without overflow items will
+ * be a leaf-no-overflow type.
+ */
+ if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
+ WT_RET_MSG(session, WT_ERROR,
+ "page at %s, of type %s and referenced in its "
+ "parent by a cell of type %s, contains overflow "
+ "items",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_page_type_string(page->type),
+ __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
+ break;
+ }
+
+ /* Check tree connections and recursively descend the tree. */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ /* For each entry in an internal page, verify the subtree. */
+ entry = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
+ /*
+ * It's a depth-first traversal: this entry's starting
+ * record number should be 1 more than the total records
+ * reviewed to this point.
+ */
+ ++entry;
+ if (child_ref->key.recno != vs->record_total + 1) {
+ WT_RET_MSG(session, WT_ERROR,
+ "the starting record number in entry %"
+ PRIu32 " of the column internal page at "
+ "%s is %" PRIu64 " and the expected "
+ "starting record number is %" PRIu64,
+ entry,
+ __wt_page_addr_string(
+ session, child_ref, vs->tmp1),
+ child_ref->key.recno,
+ vs->record_total + 1);
+ }
+
+ /* Verify the subtree. */
+ WT_RET(__wt_page_in(session, child_ref, 0));
+ ret = __verify_tree(session, child_ref, vs);
+ WT_TRET(__wt_page_release(session, child_ref, 0));
+ WT_RET(ret);
+
+ __wt_cell_unpack(child_ref->addr, unpack);
+ WT_RET(bm->verify_addr(
+ bm, session, unpack->data, unpack->size));
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ /* For each entry in an internal page, verify the subtree. */
+ entry = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
+ /*
+ * It's a depth-first traversal: this entry's starting
+ * key should be larger than the largest key previously
+ * reviewed.
+ *
+ * The 0th key of any internal page is magic, and we
+ * can't test against it.
+ */
+ ++entry;
+ if (entry != 1)
+ WT_RET(__verify_row_int_key_order(
+ session, page, child_ref, entry, vs));
+
+ /* Verify the subtree. */
+ WT_RET(__wt_page_in(session, child_ref, 0));
+ ret = __verify_tree(session, child_ref, vs);
+ WT_TRET(__wt_page_release(session, child_ref, 0));
+ WT_RET(ret);
+
+ __wt_cell_unpack(child_ref->addr, unpack);
+ WT_RET(bm->verify_addr(
+ bm, session, unpack->data, unpack->size));
+ } WT_INTL_FOREACH_END;
+ break;
+ }
+ return (0);
+}
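
The column-store checks in __verify_tree all hinge on one running invariant: in a depth-first walk, each page's starting record number must be exactly one past the records counted so far. A minimal model (hypothetical types):

    #include <stdint.h>

    struct vstate { uint64_t record_total; };

    /* Return 0 if the page's starting recno is consistent, -1 if not. */
    static int
    check_page_recno(
        struct vstate *vs, uint64_t start_recno, uint64_t entries)
    {
        if (start_recno != vs->record_total + 1)
            return (-1);    /* a gap or overlap in the key space */
        vs->record_total += entries;
        return (0);
    }
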
+
+/*
+ * __verify_row_int_key_order --
+ * Compare a key on an internal page to the largest key we've seen so
+ * far; update the largest key we've seen so far to that key.
+ */
+static int
+__verify_row_int_key_order(WT_SESSION_IMPL *session,
+ WT_PAGE *parent, WT_REF *ref, uint32_t entry, WT_VSTUFF *vs)
+{
+ WT_BTREE *btree;
+ WT_ITEM item;
+ int cmp;
+
+ btree = S2BT(session);
+
+ /* The maximum key is set, we updated it from a leaf page first. */
+ WT_ASSERT(session, vs->max_addr->size != 0);
+
+ /* Get the parent page's internal key. */
+ __wt_ref_key(parent, ref, &item.data, &item.size);
+
+ /* Compare the key against the largest key we've seen so far. */
+ WT_RET(__wt_compare(
+ session, btree->collator, &item, vs->max_key, &cmp));
+ if (cmp <= 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "the internal key in entry %" PRIu32 " on the page at %s "
+ "sorts before the last key appearing on page %s, earlier "
+ "in the tree",
+ entry,
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ (char *)vs->max_addr->data);
+
+ /* Update the largest key we've seen to the key just checked. */
+ WT_RET(__wt_buf_set(session, vs->max_key, item.data, item.size));
+ (void)__wt_page_addr_string(session, ref, vs->max_addr);
+
+ return (0);
+}
+
+/*
+ * __verify_row_leaf_key_order --
+ * Compare the first key on a leaf page to the largest key we've seen so
+ * far; update the largest key we've seen so far to the last key on the page.
+ */
+static int
+__verify_row_leaf_key_order(
+ WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ int cmp;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * If a tree is empty (just created), it won't have keys; if there
+ * are no keys, we're done.
+ */
+ if (page->pg_row_entries == 0)
+ return (0);
+
+ /*
+ * We visit our first leaf page before setting the maximum key (the 0th
+ * keys on the internal pages leading to the smallest leaf in the tree
+ * are all empty entries).
+ */
+ if (vs->max_addr->size != 0) {
+ WT_RET(__wt_row_leaf_key_copy(
+ session, page, page->pg_row_d, vs->tmp1));
+
+ /*
+ * Compare the key against the largest key we've seen so far.
+ *
+ * If we're comparing against a key taken from an internal page,
+ * we can compare equal (which is an expected path, the internal
+ * page key is often a copy of the leaf page's first key). But,
+ * in the case of the 0th slot on an internal page, the last key
+ * we've seen was a key from a previous leaf page, and it's not
+ * OK to compare equally in that case.
+ */
+ WT_RET(__wt_compare(session,
+ btree->collator, vs->tmp1, (WT_ITEM *)vs->max_key, &cmp));
+ if (cmp < 0)
+ WT_RET_MSG(session, WT_ERROR,
+ "the first key on the page at %s sorts equal to or "
+ "less than a key appearing on the page at %s, "
+ "earlier in the tree",
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ (char *)vs->max_addr->data);
+ }
+
+ /* Update the largest key we've seen to the last key on this page. */
+ WT_RET(__wt_row_leaf_key_copy(session, page,
+ page->pg_row_d + (page->pg_row_entries - 1), vs->max_key));
+ (void)__wt_page_addr_string(session, ref, vs->max_addr);
+
+ return (0);
+}
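
Taken together, the two key-order checks maintain a single invariant: the walk remembers the largest key seen so far, internal keys must sort strictly after it, and leaf keys may also compare equal (an internal key is often a copy of a leaf page's first key). The acceptance rule in isolation, assuming cmp is the collator's result for the new key against that maximum:

    /* Return nonzero if the new key preserves the ordering invariant. */
    static int
    key_order_ok(int cmp, int key_is_internal)
    {
        return (key_is_internal ? cmp > 0 : cmp >= 0);
    }
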
+
+/*
+ * __verify_overflow_cell --
+ * Verify any overflow cells on the page.
+ */
+static int
+__verify_overflow_cell(
+ WT_SESSION_IMPL *session, WT_REF *ref, int *found, WT_VSTUFF *vs)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_RET;
+ const WT_PAGE_HEADER *dsk;
+ uint32_t cell_num, i;
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ *found = 0;
+
+ /*
+ * If a tree is empty (just created), it won't have a disk image;
+ * if there is no disk image, we're done.
+ */
+ if ((dsk = ref->page->dsk) == NULL)
+ return (0);
+
+ /* Walk the disk page, verifying pages referenced by overflow cells. */
+ cell_num = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_VALUE_OVFL:
+ *found = 1;
+ WT_ERR(__verify_overflow(
+ session, unpack->data, unpack->size, vs));
+ break;
+ }
+ }
+
+ return (0);
+
+err: WT_RET_MSG(session, ret,
+ "cell %" PRIu32 " on page at %s references an overflow item at %s "
+ "that failed verification",
+ cell_num - 1,
+ __wt_page_addr_string(session, ref, vs->tmp1),
+ __wt_addr_string(session, unpack->data, unpack->size, vs->tmp2));
+}
+
+/*
+ * __verify_overflow --
+ * Read in an overflow page and check it.
+ */
+static int
+__verify_overflow(WT_SESSION_IMPL *session,
+ const uint8_t *addr, size_t addr_size, WT_VSTUFF *vs)
+{
+ WT_BM *bm;
+ const WT_PAGE_HEADER *dsk;
+
+ bm = S2BT(session)->bm;
+
+ /* Read and verify the overflow item. */
+ WT_RET(__wt_bt_read(session, vs->tmp1, addr, addr_size));
+
+ /*
+ * The physical page has already been verified, but we haven't confirmed
+ * it was an overflow page, only that it was a valid page. Confirm it's
+ * the type of page we expected.
+ */
+ dsk = vs->tmp1->data;
+ if (dsk->type != WT_PAGE_OVFL)
+ WT_RET_MSG(session, WT_ERROR,
+ "overflow referenced page at %s is not an overflow page",
+ __wt_addr_string(session, addr, addr_size, vs->tmp1));
+
+ WT_RET(bm->verify_addr(bm, session, addr, addr_size));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
new file mode 100644
index 00000000000..a14f9f1078e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy_dsk.c
@@ -0,0 +1,739 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __err_cell_corrupted(WT_SESSION_IMPL *, uint32_t, const char *);
+static int __err_cell_type(
+ WT_SESSION_IMPL *, uint32_t, const char *, uint8_t, uint8_t);
+static int __err_eof(WT_SESSION_IMPL *, uint32_t, const char *);
+static int __verify_dsk_chunk(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, uint32_t);
+static int __verify_dsk_col_fix(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_col_int(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_col_var(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+static int __verify_dsk_memsize(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *, WT_CELL *);
+static int __verify_dsk_row(
+ WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
+
+#define WT_ERR_VRFY(session, ...) do { \
+ if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ __wt_errx(session, __VA_ARGS__); \
+ goto err; \
+} while (0)
+
+#define WT_RET_VRFY(session, ...) do { \
+ if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ __wt_errx(session, __VA_ARGS__); \
+ return (WT_ERROR); \
+} while (0)
+
+/*
+ * __wt_verify_dsk_image --
+ * Verify a single block as read from disk.
+ */
+int
+__wt_verify_dsk_image(WT_SESSION_IMPL *session,
+ const char *addr, const WT_PAGE_HEADER *dsk, size_t size)
+{
+ const uint8_t *p, *end;
+ u_int i;
+ uint8_t flags;
+
+ /* Check the page type. */
+ switch (dsk->type) {
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_OVFL:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ break;
+ case WT_PAGE_INVALID:
+ default:
+ WT_RET_VRFY(session,
+ "page at %s has an invalid type of %" PRIu32,
+ addr, dsk->type);
+ }
+
+ /* Check the page record number. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ if (dsk->recno != 0)
+ break;
+ WT_RET_VRFY(session,
+ "%s page at %s has a record number of zero",
+ __wt_page_type_string(dsk->type), addr);
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_OVFL:
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ if (dsk->recno == 0)
+ break;
+ WT_RET_VRFY(session,
+ "%s page at %s has a non-zero record number",
+ __wt_page_type_string(dsk->type), addr);
+ }
+
+ /* Check the page flags. */
+ flags = dsk->flags;
+ if (LF_ISSET(WT_PAGE_COMPRESSED))
+ LF_CLR(WT_PAGE_COMPRESSED);
+ if (dsk->type == WT_PAGE_ROW_LEAF) {
+ if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
+ LF_ISSET(WT_PAGE_EMPTY_V_NONE))
+ WT_RET_VRFY(session,
+ "page at %s has invalid flags combination: 0x%"
+ PRIx8,
+ addr, dsk->flags);
+ if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
+ LF_CLR(WT_PAGE_EMPTY_V_ALL);
+ if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
+ LF_CLR(WT_PAGE_EMPTY_V_NONE);
+ }
+ if (flags != 0)
+ WT_RET_VRFY(session,
+ "page at %s has invalid flags set: 0x%" PRIx8,
+ addr, flags);
+
+ /* Check that the unused page header bytes are zeroed. */
+ for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
+ if (*p != '\0')
+ WT_RET_VRFY(session,
+ "page at %s has non-zero unused page header bytes",
+ addr);
+
+ /*
+ * Any bytes after the data chunk should be nul bytes; ignore if the
+ * size is 0, that allows easy checking of disk images where we don't
+ * have the size.
+ */
+ if (size != 0) {
+ p = (uint8_t *)dsk + dsk->mem_size;
+ end = (uint8_t *)dsk + size;
+ for (; p < end; ++p)
+ if (*p != '\0')
+ WT_RET_VRFY(session,
+ "%s page at %s has non-zero trailing bytes",
+ __wt_page_type_string(dsk->type), addr);
+ }
+
+ /* Verify the items on the page. */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ return (__verify_dsk_col_int(session, addr, dsk));
+ case WT_PAGE_COL_FIX:
+ return (__verify_dsk_col_fix(session, addr, dsk));
+ case WT_PAGE_COL_VAR:
+ return (__verify_dsk_col_var(session, addr, dsk));
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ return (__verify_dsk_row(session, addr, dsk));
+ case WT_PAGE_BLOCK_MANAGER:
+ case WT_PAGE_OVFL:
+ return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
+ WT_ILLEGAL_VALUE(session);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_verify_dsk --
+ * Verify a single Btree page as read from disk.
+ */
+int
+__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
+{
+ return (__wt_verify_dsk_image(session, addr, buf->data, buf->size));
+}
+
+/*
+ * __verify_dsk_row --
+ * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
+ */
+static int
+__verify_dsk_row(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(current);
+ WT_DECL_ITEM(last_ovfl);
+ WT_DECL_ITEM(last_pfx);
+ WT_DECL_RET;
+ WT_ITEM *last;
+ enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
+ void *huffman;
+ uint32_t cell_num, cell_type, i, key_cnt, prefix;
+ uint8_t *end;
+ int cmp;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &current));
+ WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
+ WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
+ last = last_ovfl;
+
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ last_cell_type = FIRST;
+ cell_num = 0;
+ key_cnt = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /* Carefully unpack the cell. */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
+ ret = __err_cell_corrupted(session, cell_num, addr);
+ goto err;
+ }
+
+ /* Check the raw and collapsed cell types. */
+ WT_ERR(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_ERR(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+ cell_type = unpack->type;
+
+ /*
+ * Check ordering relationships between the WT_CELL entries.
+ * For row-store internal pages, check for:
+ * two values in a row,
+ * two keys in a row,
+ * a value as the first cell on a page.
+ * For row-store leaf pages, check for:
+ * two values in a row,
+ * a value as the first cell on a page.
+ */
+ switch (cell_type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ ++key_cnt;
+ switch (last_cell_type) {
+ case FIRST:
+ case WAS_VALUE:
+ break;
+ case WAS_KEY:
+ if (dsk->type == WT_PAGE_ROW_LEAF)
+ break;
+ WT_ERR_VRFY(session,
+ "cell %" PRIu32 " on page at %s is the "
+ "first of two adjacent keys",
+ cell_num - 1, addr);
+ }
+ last_cell_type = WAS_KEY;
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ switch (last_cell_type) {
+ case FIRST:
+ WT_ERR_VRFY(session,
+ "page at %s begins with a value", addr);
+ case WAS_KEY:
+ break;
+ case WAS_VALUE:
+ WT_ERR_VRFY(session,
+ "cell %" PRIu32 " on page at %s is the "
+ "first of two adjacent values",
+ cell_num - 1, addr);
+ }
+ last_cell_type = WAS_VALUE;
+ break;
+ }
+
+ /* Check if any referenced item has a valid address. */
+ switch (cell_type) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_VALUE_OVFL:
+ if (!bm->addr_valid(bm,
+ session, unpack->data, unpack->size))
+ goto eof;
+ break;
+ }
+
+ /*
+ * Remaining checks are for key order and prefix compression.
+ * If this cell isn't a key, we're done, move to the next cell.
+ * If this cell is an overflow item, instantiate the key and
+ * compare it with the last key. Otherwise, we have to deal
+ * with prefix compression.
+ */
+ switch (cell_type) {
+ case WT_CELL_KEY:
+ break;
+ case WT_CELL_KEY_OVFL:
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, dsk->type, unpack, current));
+ goto key_compare;
+ default:
+ /* Not a key -- continue with the next cell. */
+ continue;
+ }
+
+ /*
+ * Prefix compression checks.
+ *
+ * Confirm the first non-overflow key on a page has a zero
+ * prefix compression count.
+ */
+ prefix = unpack->prefix;
+ if (last_pfx->size == 0 && prefix != 0)
+ WT_ERR_VRFY(session,
+ "the %" PRIu32 " key on page at %s is the first "
+ "non-overflow key on the page and has a non-zero "
+ "prefix compression value",
+ cell_num, addr);
+
+ /* Confirm the prefix compression count is possible. */
+ if (cell_num > 1 && prefix > last->size)
+ WT_ERR_VRFY(session,
+ "key %" PRIu32 " on page at %s has a prefix "
+ "compression count of %" PRIu32 ", larger than "
+ "the length of the previous key, %" WT_SIZET_FMT,
+ cell_num, addr, prefix, last->size);
+
+ /*
+ * If Huffman decoding is required, unpack the cell to build the
+ * key, then resolve the prefix. Otherwise, we can do it faster
+ * internally because we don't have to shuffle memory around as
+ * much.
+ */
+ if (huffman != NULL) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, dsk->type, unpack, current));
+
+ /*
+ * If there's a prefix, make sure there's enough buffer
+ * space, then shift the decoded data past the prefix
+ * and copy the prefix into place. Take care with the
+ * pointers: current->data may be pointing inside the
+ * buffer.
+ */
+ if (prefix != 0) {
+ WT_ERR(__wt_buf_grow(
+ session, current, prefix + current->size));
+ memmove((uint8_t *)current->mem + prefix,
+ current->data, current->size);
+ memcpy(current->mem, last->data, prefix);
+ current->data = current->mem;
+ current->size += prefix;
+ }
+ } else {
+ /*
+ * Get the cell's data/length and make sure we have
+ * enough buffer space.
+ */
+ WT_ERR(__wt_buf_init(
+ session, current, prefix + unpack->size));
+
+ /* Copy the prefix then the data into place. */
+ if (prefix != 0)
+ memcpy(current->mem, last->data, prefix);
+ memcpy((uint8_t *)current->mem + prefix, unpack->data,
+ unpack->size);
+ current->size = prefix + unpack->size;
+ }
+
+key_compare: /*
+ * Compare the current key against the last key.
+ *
+ * Be careful about the 0th key on internal pages: we only store
+ * the first byte and custom collators may not be able to handle
+ * truncated keys.
+ */
+ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
+ (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
+ WT_ERR(__wt_compare(
+ session, btree->collator, last, current, &cmp));
+ if (cmp >= 0)
+ WT_ERR_VRFY(session,
+ "the %" PRIu32 " and %" PRIu32 " keys on "
+ "page at %s are incorrectly sorted",
+ cell_num - 2, cell_num, addr);
+ }
+
+ /*
+ * Swap the buffers: last always references the last key entry,
+ * last_pfx and last_ovfl reference the last prefix-compressed
+ * and last overflow key entries. Current gets pointed to the
+ * buffer we're not using this time around, which is where the
+ * next key goes.
+ */
+ last = current;
+ if (cell_type == WT_CELL_KEY) {
+ current = last_pfx;
+ last_pfx = last;
+ } else {
+ current = last_ovfl;
+ last_ovfl = last;
+ }
+ WT_ASSERT(session, last != current);
+ }
+ WT_ERR(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ /*
+ * On row-store internal pages, and on row-store leaf pages, where the
+ * "no empty values" flag is set, the key count should be equal to half
+ * the number of physical entries. On row-store leaf pages where the
+ * "all empty values" flag is set, the key count should be equal to the
+ * number of physical entries.
+ */
+ if (dsk->type == WT_PAGE_ROW_INT && key_cnt * 2 != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s has a key count of %" PRIu32 " and a "
+ "physical entry count of %" PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+ if (dsk->type == WT_PAGE_ROW_LEAF &&
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) &&
+ key_cnt != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s with the 'all empty values' flag set has a "
+ "key count of %" PRIu32 " and a physical entry count of %"
+ PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+ if (dsk->type == WT_PAGE_ROW_LEAF &&
+ F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) &&
+ key_cnt * 2 != dsk->u.entries)
+ WT_ERR_VRFY(session,
+ "%s page at %s with the 'no empty values' flag set has a "
+ "key count of %" PRIu32 " and a physical entry count of %"
+ PRIu32,
+ __wt_page_type_string(dsk->type),
+ addr, key_cnt, dsk->u.entries);
+
+ if (0) {
+eof: ret = __err_eof(session, cell_num, addr);
+ }
+
+ if (0) {
+err: if (ret == 0)
+ ret = WT_ERROR;
+ }
+ __wt_scr_free(&current);
+ __wt_scr_free(&last_pfx);
+ __wt_scr_free(&last_ovfl);
+ return (ret);
+}
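
The non-Huffman branch above rebuilds each prefix-compressed key by gluing "prefix" bytes of the previous key onto the cell's suffix. That reconstruction in isolation, as a hypothetical helper (the caller guarantees buf holds at least prefix + suffix_len bytes):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static size_t
    key_decompress(uint8_t *buf, const uint8_t *last_key, size_t prefix,
        const uint8_t *suffix, size_t suffix_len)
    {
        memcpy(buf, last_key, prefix);              /* shared prefix */
        memcpy(buf + prefix, suffix, suffix_len);   /* cell's suffix */
        return (prefix + suffix_len);
    }
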
+
+/*
+ * __verify_dsk_col_int --
+ * Walk a WT_PAGE_COL_INT disk page and verify it.
+ */
+static int
+__verify_dsk_col_int(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ uint32_t cell_num, i;
+ uint8_t *end;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ cell_num = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /* Carefully unpack the cell. */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, addr));
+
+ /* Check the raw and collapsed cell types. */
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+
+ /* Check if any referenced item is entirely in the file. */
+ if (!bm->addr_valid(bm, session, unpack->data, unpack->size))
+ return (__err_eof(session, cell_num, addr));
+ }
+ WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ return (0);
+}
+
+/*
+ * __verify_dsk_col_fix --
+ * Walk a WT_PAGE_COL_FIX disk page and verify it.
+ */
+static int
+__verify_dsk_col_fix(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BTREE *btree;
+ uint32_t datalen;
+
+ btree = S2BT(session);
+
+ datalen = __bitstr_size(btree->bitcnt * dsk->u.entries);
+ return (__verify_dsk_chunk(session, addr, dsk, datalen));
+}
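
Fixed-length column-store entries are bitcnt-bit values packed into a bit string, so the expected data length is the total bit count rounded up to whole bytes; that is presumably all __bitstr_size computes. The arithmetic by itself:

    #include <stdint.h>

    /* Bytes needed for "entries" values of "bitcnt" bits each. */
    static uint32_t
    fix_datalen(uint32_t bitcnt, uint32_t entries)
    {
        return ((bitcnt * entries + 7) / 8);
    }
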
+
+/*
+ * __verify_dsk_col_var --
+ * Walk a WT_PAGE_COL_VAR disk page and verify it.
+ */
+static int
+__verify_dsk_col_var(
+ WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ size_t last_size;
+ uint32_t cell_num, cell_type, i;
+ int last_deleted;
+ const uint8_t *last_data;
+ uint8_t *end;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ unpack = &_unpack;
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ last_data = NULL;
+ last_size = 0;
+ last_deleted = 0;
+
+ cell_num = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++cell_num;
+
+ /* Carefully unpack the cell. */
+ if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, addr));
+
+ /* Check the raw and collapsed cell types. */
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->raw, dsk->type));
+ WT_RET(__err_cell_type(
+ session, cell_num, addr, unpack->type, dsk->type));
+ cell_type = unpack->type;
+
+ /* Check if any referenced item is entirely in the file. */
+ if (cell_type == WT_CELL_VALUE_OVFL &&
+ !bm->addr_valid(bm, session, unpack->data, unpack->size))
+ return (__err_eof(session, cell_num, addr));
+
+ /*
+ * Compare the last two items and see if reconciliation missed
+ * a chance for RLE encoding. We don't have to care about data
+ * encoding or anything else, a byte comparison is enough.
+ */
+ if (last_deleted == 1) {
+ if (cell_type == WT_CELL_DEL)
+ goto match_err;
+ } else
+ if (cell_type == WT_CELL_VALUE &&
+ last_data != NULL &&
+ last_size == unpack->size &&
+ memcmp(last_data, unpack->data, last_size) == 0)
+match_err: WT_RET_VRFY(session,
+ "data entries %" PRIu32 " and %" PRIu32
+ " on page at %s are identical and should "
+ "have been run-length encoded",
+ cell_num - 1, cell_num, addr);
+
+ switch (cell_type) {
+ case WT_CELL_DEL:
+ last_deleted = 1;
+ last_data = NULL;
+ break;
+ case WT_CELL_VALUE_OVFL:
+ last_deleted = 0;
+ last_data = NULL;
+ break;
+ case WT_CELL_VALUE:
+ last_deleted = 0;
+ last_data = unpack->data;
+ last_size = unpack->size;
+ break;
+ }
+ }
+ WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+
+ return (0);
+}
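
The RLE check above flags adjacent cells that reconciliation should have merged: two byte-identical values, or two deletes, in a row. The comparison at its core (a deleted cell is represented here by a NULL data pointer, mirroring last_data above):

    #include <stddef.h>
    #include <string.h>

    /* Return nonzero if two adjacent cells should have been one RLE cell. */
    static int
    rle_missed(const void *a, size_t alen, const void *b, size_t blen)
    {
        if (a == NULL && b == NULL)     /* two deletes in a row */
            return (1);
        return (a != NULL && b != NULL &&
            alen == blen && memcmp(a, b, alen) == 0);
    }
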
+
+/*
+ * __verify_dsk_memsize --
+ * Verify the last cell on the page matches the page's memory size.
+ */
+static int
+__verify_dsk_memsize(WT_SESSION_IMPL *session,
+ const char *addr, const WT_PAGE_HEADER *dsk, WT_CELL *cell)
+{
+ size_t len;
+
+ /*
+ * We use the fact that cells exactly fill a page to detect the case of
+ * a row-store leaf page where the last cell is a key (that is, there's
+ * no subsequent value cell). Check for any page type containing cells.
+ */
+ len = WT_PTRDIFF((uint8_t *)dsk + dsk->mem_size, cell);
+ if (len == 0)
+ return (0);
+ WT_RET_VRFY(session,
+ "%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data "
+ "after the last cell",
+ __wt_page_type_string(dsk->type), addr, len);
+}
+
+/*
+ * __verify_dsk_chunk --
+ * Verify a Chunk O' Data on a Btree page.
+ */
+static int
+__verify_dsk_chunk(WT_SESSION_IMPL *session,
+ const char *addr, const WT_PAGE_HEADER *dsk, uint32_t datalen)
+{
+ WT_BTREE *btree;
+ uint8_t *p, *end;
+
+ btree = S2BT(session);
+ end = (uint8_t *)dsk + dsk->mem_size;
+
+ /*
+ * Fixed-length column-store and overflow pages are simple chunks of
+ * data.
+ */
+ if (datalen == 0)
+ WT_RET_VRFY(session,
+ "%s page at %s has no data",
+ __wt_page_type_string(dsk->type), addr);
+
+ /* Verify the data doesn't overflow the end of the page. */
+ p = WT_PAGE_HEADER_BYTE(btree, dsk);
+ if (p + datalen > end)
+ WT_RET_VRFY(session,
+ "data on page at %s extends past the end of the page",
+ addr);
+
+ /* Any bytes after the data chunk should be nul bytes. */
+ for (p += datalen; p < end; ++p)
+ if (*p != '\0')
+ WT_RET_VRFY(session,
+ "%s page at %s has non-zero trailing bytes",
+ __wt_page_type_string(dsk->type), addr);
+
+ return (0);
+}
+
+/*
+ * __err_cell_corrupted --
+ * Generic corrupted cell, we couldn't read it.
+ */
+static int
+__err_cell_corrupted(
+ WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+{
+ WT_RET_VRFY(session,
+ "item %" PRIu32 " on page at %s is a corrupted cell",
+ entry_num, addr);
+}
+
+/*
+ * __err_cell_type --
+ * Generic illegal cell type for a particular page type error.
+ */
+static int
+__err_cell_type(WT_SESSION_IMPL *session,
+ uint32_t entry_num, const char *addr, uint8_t cell_type, uint8_t dsk_type)
+{
+ switch (cell_type) {
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ if (dsk_type == WT_PAGE_COL_INT ||
+ dsk_type == WT_PAGE_ROW_INT)
+ return (0);
+ break;
+ case WT_CELL_DEL:
+ if (dsk_type == WT_PAGE_COL_VAR)
+ return (0);
+ break;
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_SHORT:
+ if (dsk_type == WT_PAGE_ROW_INT ||
+ dsk_type == WT_PAGE_ROW_LEAF)
+ return (0);
+ break;
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_KEY_SHORT_PFX:
+ if (dsk_type == WT_PAGE_ROW_LEAF)
+ return (0);
+ break;
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL_RM:
+ /*
+ * Removed overflow cells are in-memory only, it's an error to
+ * ever see one on a disk page.
+ */
+ break;
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_COPY:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ if (dsk_type == WT_PAGE_COL_VAR ||
+ dsk_type == WT_PAGE_ROW_LEAF)
+ return (0);
+ break;
+ default:
+ break;
+ }
+
+ WT_RET_VRFY(session,
+ "illegal cell and page type combination: cell %" PRIu32
+ " on page at %s is a %s cell on a %s page",
+ entry_num, addr,
+ __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type));
+}
+
+/*
+ * __err_eof --
+ * Generic item references non-existent file pages error.
+ */
+static int
+__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+{
+ WT_RET_VRFY(session,
+ "off-page item %" PRIu32
+ " on page at %s references non-existent file pages",
+ entry_num, addr);
+}
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
new file mode 100644
index 00000000000..ef35d215ec0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -0,0 +1,285 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_tree_walk --
+ * Move to the next/previous page in the tree.
+ */
+int
+__wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *couple, *ref;
+ WT_TXN_STATE *txn_state;
+ int descending, prev, skip;
+ uint32_t slot;
+
+ btree = S2BT(session);
+ descending = 0;
+
+ /*
+ * Tree walks are special: they look inside page structures that splits
+ * may want to free. Publish that the tree is active during this
+ * window.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+
+ /*
+ * !!!
+ * Fast-truncate currently only works on row-store trees.
+ */
+ if (btree->type != BTREE_ROW)
+ LF_CLR(WT_READ_TRUNCATE);
+
+ prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;
+
+ /*
+ * Pin a transaction ID, required to safely look at page index
+ * structures, if our caller has not already done so.
+ */
+ txn_state = WT_SESSION_TXN_STATE(session);
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = S2C(session)->txn_global.last_running;
+ else
+ txn_state = NULL;
+
+ /*
+ * There are multiple reasons and approaches to walking the in-memory
+ * tree:
+ *
+ * (1) finding pages to evict (the eviction server);
+ * (2) writing just dirty leaves or internal nodes (checkpoint);
+ * (3) discarding pages (close);
+ * (4) truncating pages in a range (fast truncate);
+ * (5) skipping pages based on outside information (compaction);
+ * (6) cursor scans (applications).
+ *
+ * Except for cursor scans and compaction, the walk is limited to the
+ * cache, no pages are read. In all cases, hazard pointers protect the
+ * walked pages from eviction.
+ *
+ * Walks use hazard-pointer coupling through the tree and that's OK
+ * (hazard pointers can't deadlock, so there's none of the usual
+ * problems found when logically locking up a btree). If the eviction
+ * thread tries to evict the active page, it fails because of our
+ * hazard pointer. If eviction tries to evict our parent, that fails
+ * because the parent has a child page that can't be discarded. We do
+ * play one game: don't couple up to our parent and then back down to a
+ * new leaf, couple to the next page to which we're descending, it
+ * saves a hazard-pointer swap for each cursor page movement.
+ *
+ * !!!
+ * NOTE: we depend on the fact it's OK to release a page we don't hold,
+ * that is, it's OK to release couple when couple is set to NULL.
+ *
+ * Take a copy of any held page and clear the return value. Remember
+ * the hazard pointer we're currently holding.
+ *
+ * We may be passed a pointer to btree->evict_page that we are clearing
+ * here. We check when discarding pages that we're not discarding that
+ * page, so this clear must be done before the page is released.
+ */
+ couple = ref = *refp;
+ *refp = NULL;
+
+ /* If no page is active, begin a walk from the start of the tree. */
+ if (ref == NULL) {
+ ref = &btree->root;
+ if (ref->page == NULL) {
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+ goto done;
+ }
+ goto descend;
+ }
+
+ascend: /*
+ * If the active page was the root, we've reached the walk's end.
+ * Release any hazard-pointer we're holding.
+ */
+ if (__wt_ref_is_root(ref)) {
+ WT_ERR(__wt_page_release(session, couple, flags));
+ goto done;
+ }
+
+ /* Figure out the current slot in the WT_REF array. */
+ __wt_page_refp(session, ref, &pindex, &slot);
+
+ if (0) {
+restart: /*
+ * The page we're moving to might have split, in which case find
+ * the last position we held.
+ *
+ * If we were starting a tree walk, begin again.
+ *
+ * If we were in the process of descending, repeat the descent.
+ * If we were moving within a single level of the tree, repeat
+ * the last move.
+ */
+ ref = couple;
+ if (ref == &btree->root) {
+ if (ref->page == NULL) {
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+ goto done;
+ }
+ goto descend;
+ }
+ __wt_page_refp(session, ref, &pindex, &slot);
+ if (descending)
+ goto descend;
+ }
+
+ for (;;) {
+ /*
+ * If we're at the last/first slot on the page, return this page
+ * in post-order traversal. Otherwise we move to the next/prev
+ * slot and left/right-most element in its subtree.
+ */
+ if ((prev && slot == 0) ||
+ (!prev && slot == pindex->entries - 1)) {
+ ref = ref->home->pg_intl_parent_ref;
+
+ /* Optionally skip internal pages. */
+ if (LF_ISSET(WT_READ_SKIP_INTL))
+ goto ascend;
+
+ /*
+ * We've ascended the tree and are returning an internal
+ * page. If it's the root, discard our hazard pointer,
+ * otherwise, swap our hazard pointer for the page we'll
+ * return.
+ */
+ if (__wt_ref_is_root(ref))
+ WT_ERR(__wt_page_release(
+ session, couple, flags));
+ else {
+ /*
+ * Locate the reference to our parent page then
+ * swap our child hazard pointer for the parent.
+ * We don't handle a restart return because it
+ * would require additional complexity in the
+ * restart code (ascent code somewhat like the
+ * descent code already there), and it's not a
+ * possible return: we're moving to the parent
+ * of the current child, not another child of
+ * the same parent, there's no way our parent
+ * split.
+ */
+ __wt_page_refp(session, ref, &pindex, &slot);
+ if ((ret = __wt_page_swap(
+ session, couple, ref, flags)) != 0) {
+ WT_TRET(__wt_page_release(
+ session, couple, flags));
+ WT_ERR(ret);
+ }
+ }
+
+ *refp = ref;
+ goto done;
+ }
+
+ if (prev)
+ --slot;
+ else
+ ++slot;
+
+ for (descending = 0;;) {
+ ref = pindex->index[slot];
+
+ if (LF_ISSET(WT_READ_CACHE)) {
+ /*
+ * Only look at unlocked pages in memory:
+ * fast-path some common cases.
+ */
+ if (LF_ISSET(WT_READ_NO_WAIT) &&
+ ref->state != WT_REF_MEM)
+ break;
+ } else if (LF_ISSET(WT_READ_TRUNCATE)) {
+ /*
+ * If deleting a range, try to delete the page
+ * without instantiating it.
+ */
+ WT_ERR(__wt_delete_page(session, ref, &skip));
+ if (skip)
+ break;
+ } else if (LF_ISSET(WT_READ_COMPACT)) {
+ /*
+ * Skip deleted pages, rewriting them doesn't
+ * seem useful.
+ */
+ if (ref->state == WT_REF_DELETED)
+ break;
+
+ /*
+ * If the page is in-memory, we want to look at
+ * it (it may have been modified and written,
+ * and the current location is the interesting
+ * one in terms of compaction, not the original
+ * location). If the page isn't in-memory, test
+ * if the page will help with compaction, don't
+ * read it if we don't have to.
+ */
+ if (ref->state == WT_REF_DISK) {
+ WT_ERR(__wt_compact_page_skip(
+ session, ref, &skip));
+ if (skip)
+ break;
+ }
+ } else {
+ /*
+ * If iterating a cursor, try to skip deleted
+ * pages that are visible to us.
+ */
+ if (ref->state == WT_REF_DELETED &&
+ __wt_delete_page_skip(session, ref))
+ break;
+ }
+
+ ret = __wt_page_swap(session, couple, ref, flags);
+ if (ret == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ if (ret == WT_RESTART)
+ goto restart;
+ WT_ERR(ret);
+
+ /*
+ * Entering a new page: configure for traversal of any
+ * internal page's children, else return (or optionally
+ * skip) the leaf page.
+ */
+descend: couple = ref;
+ page = ref->page;
+ if (page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_COL_INT) {
+ pindex = WT_INTL_INDEX_COPY(page);
+ slot = prev ? pindex->entries - 1 : 0;
+ descending = 1;
+ } else if (LF_ISSET(WT_READ_SKIP_LEAF))
+ goto ascend;
+ else {
+ *refp = ref;
+ goto done;
+ }
+ }
+ }
+
+done:
+err: if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+
+ WT_LEAVE_PAGE_INDEX(session);
+ return (ret);
+}
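+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the traversal order __wt_tree_walk
+ * produces, not part of the WiredTiger sources. It assumes a toy tree with
+ * parent pointers and no concurrency: there is no hazard-pointer coupling,
+ * splitting or page reading here. The WT_EDITOR_SKETCH guard, TOY_NODE,
+ * toy_slot() and toy_walk_next() are all hypothetical names.
+ */
+typedef struct toy_node {
+ struct toy_node *parent; /* NULL for the root */
+ struct toy_node **child; /* NULL for a leaf */
+ int entries; /* Number of children */
+} TOY_NODE;
+
+/* Return a node's slot in its parent's child array. */
+static int
+toy_slot(TOY_NODE *node)
+{
+ int i;
+
+ for (i = 0; i < node->parent->entries; ++i)
+ if (node->parent->child[i] == node)
+ return (i);
+ return (-1); /* Unreachable in a consistent tree. */
+}
+
+/*
+ * Return the next node of a forward walk (NULL at the end): leaves in key
+ * order, each internal node after all of its children, the root last.
+ */
+static TOY_NODE *
+toy_walk_next(TOY_NODE *root, TOY_NODE *node)
+{
+ int slot;
+
+ if (node == NULL) /* Starting the walk. */
+ node = root;
+ else {
+ if (node == root) /* The root ends the walk. */
+ return (NULL);
+ slot = toy_slot(node);
+ if (slot == node->parent->entries - 1)
+ return (node->parent); /* Ascend, post-order. */
+ node = node->parent->child[slot + 1];
+ }
+ while (node->child != NULL) /* Descend to the left-most leaf. */
+ node = node->child[0];
+ return (node);
+}
+#endif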
diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c
new file mode 100644
index 00000000000..3a4a2a2987d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/col_modify.c
@@ -0,0 +1,223 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __col_insert_alloc(
+ WT_SESSION_IMPL *, uint64_t, u_int, WT_INSERT **, size_t *);
+
+/*
+ * __wt_col_modify --
+ * Column-store delete, insert, and update.
+ */
+int
+__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head, **ins_headp;
+ WT_ITEM _value;
+ WT_PAGE *page;
+ WT_UPDATE *old_upd;
+ size_t ins_size, upd_size;
+ u_int i, skipdepth;
+ int append, logged;
+
+ btree = cbt->btree;
+ ins = NULL;
+ page = cbt->ref->page;
+ append = logged = 0;
+
+ /* This code expects a remove to have a NULL value. */
+ if (is_remove) {
+ if (btree->type == BTREE_COL_FIX) {
+ value = &_value;
+ value->data = "";
+ value->size = 1;
+ } else
+ value = NULL;
+ } else {
+ /*
+ * There's some chance the application specified a record past
+ * the last record on the page. If that's the case, and we're
+ * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
+ * append list, not the update list. In addition, a recno of
+ * 0 implies an append operation: we're allocating a new row.
+ */
+ if (recno == 0 ||
+ recno > (btree->type == BTREE_COL_VAR ?
+ __col_var_last_recno(page) : __col_fix_last_recno(page)))
+ append = 1;
+ }
+
+ /* If we don't yet have a modify structure, we'll need one. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ /*
+ * Delete, insert or update a column-store entry.
+ *
+ * If modifying a previously modified record, create a new WT_UPDATE
+ * entry and have a serialized function link it into an existing
+ * WT_INSERT entry's WT_UPDATE list.
+ *
+ * Else, allocate an insert array as necessary, build a WT_INSERT and
+ * WT_UPDATE structure pair, and call a serialized function to insert
+ * the WT_INSERT structure.
+ */
+ if (cbt->compare == 0 && cbt->ins != NULL) {
+ /*
+ * If we are restoring updates that couldn't be evicted, the
+ * key must not exist on the new page.
+ */
+ WT_ASSERT(session, upd == NULL);
+
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = cbt->ins->upd));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid a data copy in WT_CURSOR.update. */
+ cbt->modify_update = upd;
+
+ /*
+ * Point the new WT_UPDATE item to the next element in the list.
+ * If we get it right, the serialization function lock acts as
+ * our memory barrier to flush this write.
+ */
+ upd->next = old_upd;
+
+ /* Serialize the update. */
+ WT_ERR(__wt_update_serial(
+ session, page, &cbt->ins->upd, &upd, upd_size));
+ } else {
+ /* Allocate the append/update list reference as necessary. */
+ if (append) {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_append, ins_headp, 1);
+ ins_headp = &page->modify->mod_append[0];
+ } else if (page->type == WT_PAGE_COL_FIX) {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_update, ins_headp, 1);
+ ins_headp = &page->modify->mod_update[0];
+ } else {
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, page->modify->mod_update, ins_headp,
+ page->pg_var_entries);
+ ins_headp = &page->modify->mod_update[cbt->slot];
+ }
+
+ /* Allocate the WT_INSERT_HEAD structure as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
+ ins_head = *ins_headp;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
+ * update the cursor to reference it (the WT_INSERT_HEAD may
+ * have just been allocated, the WT_INSERT certainly was).
+ */
+ WT_ERR(__col_insert_alloc(
+ session, recno, skipdepth, &ins, &ins_size));
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid a data copy in WT_CURSOR.update. */
+ cbt->modify_update = upd;
+ } else
+ upd_size = sizeof(WT_UPDATE) + upd->size;
+ ins->upd = upd;
+ ins_size += upd_size;
+
+ /*
+ * If there was no insert list during the search, or there was
+ * no search because the record number has not been allocated
+ * yet, the cursor's information cannot be correct: the
+ * search couldn't have initialized it.
+ *
+ * Otherwise, point the new WT_INSERT item's skiplist to the
+ * next elements in the insert list (which we will check are
+ * still valid inside the serialization function).
+ *
+ * The serial mutex acts as our memory barrier to flush these
+ * writes before inserting them into the list.
+ */
+ if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
+ for (i = 0; i < skipdepth; i++) {
+ cbt->ins_stack[i] = &ins_head->head[i];
+ ins->next[i] = cbt->next_stack[i] = NULL;
+ }
+ else
+ for (i = 0; i < skipdepth; i++)
+ ins->next[i] = cbt->next_stack[i];
+
+ /* Append or insert the WT_INSERT structure. */
+ if (append)
+ WT_ERR(__wt_col_append_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, &cbt->recno, skipdepth));
+ else
+ WT_ERR(__wt_insert_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, skipdepth));
+ }
+
+ /* If the update was successful, add it to the in-memory log. */
+ if (logged)
+ WT_ERR(__wt_txn_log_op(session, cbt));
+
+ if (0) {
+err: /*
+ * Remove the update from the current transaction, so we don't
+ * try to modify it on rollback.
+ */
+ if (logged)
+ __wt_txn_unmodify(session);
+ __wt_free(session, ins);
+ __wt_free(session, upd);
+ }
+
+ return (ret);
+}
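+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the "set next first, then publish the
+ * head" ordering used for the WT_UPDATE chain above; not part of the
+ * WiredTiger sources. C11 atomics stand in for the serialized function and
+ * its barriers, and a single writer is assumed (in WiredTiger the serial
+ * lock provides that). TOY_UPD and toy_upd_prepend() are hypothetical
+ * names.
+ */
+#include <stdatomic.h>
+
+typedef struct toy_upd {
+ struct toy_upd *next;
+ int value;
+} TOY_UPD;
+
+/*
+ * Prepend an update so a concurrent reader walking the chain sees either
+ * the old head or the fully-initialized new entry, never a half-built one.
+ */
+static void
+toy_upd_prepend(TOY_UPD *_Atomic *headp, TOY_UPD *new_upd)
+{
+ /* Point the new entry at the current chain... */
+ new_upd->next = atomic_load_explicit(headp, memory_order_relaxed);
+ /* ...then make it visible, with release semantics. */
+ atomic_store_explicit(headp, new_upd, memory_order_release);
+}
+#endif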
+
+/*
+ * __col_insert_alloc --
+ * Column-store insert: allocate a WT_INSERT structure and fill it in.
+ */
+static int
+__col_insert_alloc(WT_SESSION_IMPL *session,
+ uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+{
+ WT_INSERT *ins;
+ size_t ins_size;
+
+ /*
+ * Allocate the WT_INSERT structure and skiplist pointers, then copy
+ * the record number into place.
+ */
+ ins_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
+ WT_RET(__wt_calloc(session, 1, ins_size, &ins));
+
+ WT_INSERT_RECNO(ins) = recno;
+
+ *insp = ins;
+ *ins_sizep = ins_size;
+ return (0);
+}
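+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal, single-threaded sketch of the ins_stack
+ * bookkeeping used above, not part of the WiredTiger sources. The search
+ * records, per level, the address of the pointer the new entry must be
+ * linked through, then links it in; the real code defers that final step
+ * to a serialized function. TOY_DEPTH, TOY_INS and toy_skip_insert() are
+ * hypothetical names, and "head" is a sentinel node standing in for
+ * WT_INSERT_HEAD.
+ */
+#define TOY_DEPTH 4
+
+typedef struct toy_ins {
+ uint64_t recno;
+ struct toy_ins *next[TOY_DEPTH];
+} TOY_INS;
+
+static void
+toy_skip_insert(TOY_INS *head, TOY_INS *new_ins, u_int skipdepth)
+{
+ TOY_INS *node, **stack[TOY_DEPTH];
+ int i;
+
+ /* Search: per level, remember where the new entry belongs. */
+ for (node = head, i = TOY_DEPTH - 1; i >= 0; --i) {
+ while (node->next[i] != NULL &&
+ node->next[i]->recno < new_ins->recno)
+ node = node->next[i];
+ stack[i] = &node->next[i];
+ }
+
+ /* Link the new entry in, bottom level first. */
+ for (i = 0; i < (int)skipdepth; ++i) {
+ new_ins->next[i] = *stack[i];
+ *stack[i] = new_ins;
+ }
+}
+#endif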
diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c
new file mode 100644
index 00000000000..e4083e2282f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/col_srch.c
@@ -0,0 +1,199 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_col_search --
+ * Search a column-store tree for a specific record-based key.
+ */
+int
+__wt_col_search(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_COL *cip;
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ uint32_t base, indx, limit;
+ int depth;
+
+ btree = S2BT(session);
+
+ __cursor_pos_clear(cbt);
+
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf != NULL) {
+ current = leaf;
+ goto leaf_only;
+ }
+
+ /* Search the internal pages of the tree. */
+ current = &btree->root;
+ for (depth = 2;; ++depth) {
+restart: page = current->page;
+ if (page->type != WT_PAGE_COL_INT)
+ break;
+
+ WT_ASSERT(session, current->key.recno == page->pg_intl_recno);
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ base = pindex->entries;
+ descent = pindex->index[base - 1];
+
+ /* Fast path appends. */
+ if (recno >= descent->key.recno)
+ goto descend;
+
+ /* Binary search of internal pages. */
+ for (base = 0,
+ limit = pindex->entries - 1; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+
+ if (recno == descent->key.recno)
+ break;
+ if (recno < descent->key.recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+descend: /*
+ * Reference the slot used for next step down the tree.
+ *
+ * Base is the smallest index greater than recno and may be the
+ * (last + 1) index. The slot for descent is the one before
+ * base.
+ */
+ if (recno != descent->key.recno) {
+ /*
+ * We don't have to correct for base == 0 because the
+ * only way for base to be 0 is if recno is the page's
+ * starting recno.
+ */
+ WT_ASSERT(session, base > 0);
+ descent = pindex->index[base - 1];
+ }
+
+ /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search in the current
+ * page; otherwise return on error, the swap call ensures we're
+ * holding nothing on failure.
+ */
+ switch (ret = __wt_page_swap(session, current, descent, 0)) {
+ case 0:
+ current = descent;
+ break;
+ case WT_RESTART:
+ goto restart;
+ default:
+ return (ret);
+ }
+ }
+
+ /* Track how deep the tree gets. */
+ if (depth > btree->maximum_depth)
+ btree->maximum_depth = depth;
+
+leaf_only:
+ page = current->page;
+ cbt->ref = current;
+ cbt->recno = recno;
+ cbt->compare = 0;
+
+ /*
+ * Set the on-page slot to an impossible value larger than any possible
+ * slot (it's used to interpret the search function's return after the
+ * search returns an insert list for a page that has no entries).
+ */
+ cbt->slot = UINT32_MAX;
+
+ /*
+ * Search the leaf page. We do not check in the search path for a
+ * record greater than the maximum record in the tree; in that case,
+ * we arrive here with a record that's impossibly large for the page.
+ */
+ if (page->type == WT_PAGE_COL_FIX) {
+ if (recno >= page->pg_fix_recno + page->pg_fix_entries) {
+ cbt->recno = page->pg_fix_recno + page->pg_fix_entries;
+ goto past_end;
+ } else
+ ins_head = WT_COL_UPDATE_SINGLE(page);
+ } else
+ if ((cip = __col_var_search(page, recno)) == NULL) {
+ cbt->recno = __col_var_last_recno(page);
+ goto past_end;
+ } else {
+ cbt->slot = WT_COL_SLOT(page, cip);
+ ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
+ }
+
+ /*
+ * We have a match on the page, check for an update. Check the page's
+ * update list (fixed-length), or slot's update list (variable-length)
+ * for a better match. The only better match we can find is an exact
+ * match, otherwise the existing match on the page is the one we want.
+ * For that reason, don't set the cursor's WT_INSERT_HEAD/WT_INSERT pair
+ * until we know we have a useful entry.
+ */
+ if ((ins = __col_insert_search(
+ ins_head, cbt->ins_stack, cbt->next_stack, recno)) != NULL)
+ if (recno == WT_INSERT_RECNO(ins)) {
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+ }
+ return (0);
+
+past_end:
+ /*
+ * A record past the end of the page's standard information. Check the
+ * append list; by definition, any record on the append list is closer
+ * than the last record on the page, so it's a better choice for return.
+ * This is a rarely used path: we normally find exact matches, because
+ * column-store files are dense, but in this case the caller searched
+ * past the end of the table.
+ *
+ * Don't bother searching if the caller is appending a new record where
+ * we'll allocate the record number; we're not going to find a match by
+ * definition, and we figure out the position when we do the work.
+ */
+ cbt->ins_head = WT_COL_APPEND(page);
+ if (recno == UINT64_MAX)
+ cbt->ins = NULL;
+ else
+ cbt->ins = __col_insert_search(
+ cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
+ if (cbt->ins == NULL)
+ cbt->compare = -1;
+ else {
+ cbt->recno = WT_INSERT_RECNO(cbt->ins);
+ if (recno == cbt->recno)
+ cbt->compare = 0;
+ else if (recno < cbt->recno)
+ cbt->compare = 1;
+ else
+ cbt->compare = -1;
+ }
+
+ /*
+ * Note if the record is past the maximum record in the tree, the cursor
+ * search functions need to know for fixed-length column-stores because
+ * appended records implicitly create any skipped records, and cursor
+ * search functions have to handle that case.
+ */
+ if (cbt->compare == -1)
+ F_SET(cbt, WT_CBT_MAX_RECORD);
+ return (0);
+}
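+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the internal-page binary search above,
+ * not part of the WiredTiger sources. The loop maintains the invariant
+ * that "base" ends as the smallest slot whose starting recno is greater
+ * than the search recno, so the child to descend into is at base - 1. It
+ * assumes recno is at least the page's starting recno, as the real
+ * search's assertion does. toy_descend_slot() is a hypothetical name.
+ */
+static uint32_t
+toy_descend_slot(
+ const uint64_t *start_recno, uint32_t entries, uint64_t recno)
+{
+ uint32_t base, indx, limit;
+
+ for (base = 0, limit = entries; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ if (start_recno[indx] <= recno) {
+ base = indx + 1;
+ --limit;
+ }
+ }
+
+ /*
+ * For example, starting recnos {1, 100, 200} and a search for 150
+ * end with base == 2: slot 1 covers the record.
+ */
+ return (base - 1);
+}
+#endif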
diff --git a/src/third_party/wiredtiger/src/btree/rec_evict.c b/src/third_party/wiredtiger/src/btree/rec_evict.c
new file mode 100644
index 00000000000..4696e78059e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_evict.c
@@ -0,0 +1,468 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __hazard_exclusive(WT_SESSION_IMPL *, WT_REF *, int);
+static void __rec_discard_tree(WT_SESSION_IMPL *, WT_REF *, int, int);
+static void __rec_excl_clear(WT_SESSION_IMPL *);
+static void __rec_page_clean_update(WT_SESSION_IMPL *, WT_REF *);
+static int __rec_page_dirty_update(WT_SESSION_IMPL *, WT_REF *, int);
+static int __rec_review(WT_SESSION_IMPL *, WT_REF *, int, int, int *);
+
+/*
+ * __wt_rec_evict --
+ * Reconciliation plus eviction.
+ */
+int
+__wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_TXN_STATE *txn_state;
+ int istree;
+
+ page = ref->page;
+ istree = 0;
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICT,
+ "page %p (%s)", page, __wt_page_type_string(page->type)));
+
+ /*
+ * Pin the oldest transaction ID: eviction looks at page structures
+ * that are freed when no transaction in the system needs them.
+ */
+ txn_state = WT_SESSION_TXN_STATE(session);
+ if (txn_state->snap_min == WT_TXN_NONE)
+ txn_state->snap_min = S2C(session)->txn_global.oldest_id;
+ else
+ txn_state = NULL;
+
+ /*
+ * Get exclusive access to the page and review the page and its subtree
+ * for conditions that would block our eviction of the page. If the
+ * check fails (for example, we find a child page that can't be merged),
+ * we're done. We have to make this check for clean pages, too: while
+ * it's unlikely eviction would choose an internal page with children,
+ * that's not disallowed anywhere.
+ */
+ WT_ERR(__rec_review(session, ref, exclusive, 1, &istree));
+
+ /*
+ * Update the page's modification reference, reconciliation might have
+ * changed it.
+ */
+ mod = page->modify;
+
+ /* Count evictions of internal pages during normal operation. */
+ if (!exclusive &&
+ (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_internal);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal);
+ }
+
+ /* Discard any subtree rooted in this page. */
+ if (istree)
+ __rec_discard_tree(session, ref, exclusive, 1);
+
+ /* Update the reference and discard the page. */
+ if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
+ WT_ASSERT(session, exclusive || ref->state == WT_REF_LOCKED);
+
+ if (__wt_ref_is_root(ref))
+ __wt_ref_out(session, ref);
+ else
+ __rec_page_clean_update(session, ref);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
+ } else {
+ if (__wt_ref_is_root(ref))
+ __wt_ref_out(session, ref);
+ else
+ WT_ERR(
+ __rec_page_dirty_update(session, ref, exclusive));
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty);
+ }
+
+ if (0) {
+err: /*
+ * If unable to evict this page, release exclusive reference(s)
+ * we've acquired.
+ */
+ if (!exclusive)
+ __rec_excl_clear(session);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail);
+ }
+ session->excl_next = 0;
+
+ if (txn_state != NULL)
+ txn_state->snap_min = WT_TXN_NONE;
+
+ return (ret);
+}
+
+/*
+ * __rec_page_clean_update --
+ * Update a clean page's reference on eviction.
+ */
+static void
+__rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ /*
+ * Discard the page and update the reference structure; if the page has
+ * an address, it's a disk page; if it has no address, it's a deleted
+ * page re-instantiated (for example, by searching) and never written.
+ */
+ __wt_ref_out(session, ref);
+ WT_PUBLISH(ref->state,
+ ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);
+}
+
+/*
+ * __rec_page_dirty_update --
+ * Update a dirty page's reference on eviction.
+ */
+static int
+__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_ADDR *addr;
+ WT_PAGE *parent;
+ WT_PAGE_MODIFY *mod;
+
+ parent = ref->home;
+ mod = ref->page->modify;
+
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY: /* Page is empty */
+ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /*
+ * Update the parent to reference a deleted page. The fact that
+ * reconciliation left the page "empty" means there's no older
+ * transaction in the system that might need to see an earlier
+ * version of the page. For that reason, we clear the address
+ * of the page, if we're forced to "read" into that namespace,
+ * we'll instantiate a new page instead of trying to read from
+ * the backing store.
+ *
+ * Publish: a barrier to ensure the structure fields are set
+ * before the state change makes the page available to readers.
+ */
+ __wt_ref_out(session, ref);
+ ref->addr = NULL;
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ break;
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ /* Split the page in memory. */
+ WT_RET(__wt_split_evict(session, ref, exclusive));
+ break;
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+
+ /*
+ * Update the parent to reference the replacement page.
+ *
+ * Publish: a barrier to ensure the structure fields are set
+ * before the state change makes the page available to readers.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
+ *addr = mod->mod_replace;
+ mod->mod_replace.addr = NULL;
+ mod->mod_replace.size = 0;
+
+ __wt_ref_out(session, ref);
+ ref->addr = addr;
+ WT_PUBLISH(ref->state, WT_REF_DISK);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_discard_tree --
+ * Discard the tree rooted in a page (that is, any pages merged into
+ * it), then the page itself.
+ */
+static void
+__rec_discard_tree(
+ WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top)
+{
+ WT_REF *child;
+
+ switch (ref->page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ /* For each entry in the page... */
+ WT_INTL_FOREACH_BEGIN(session, ref->page, child) {
+ if (child->state == WT_REF_DISK ||
+ child->state == WT_REF_DELETED)
+ continue;
+ WT_ASSERT(session,
+ exclusive || child->state == WT_REF_LOCKED);
+ __rec_discard_tree(session, child, exclusive, 0);
+ } WT_INTL_FOREACH_END;
+ /* FALLTHROUGH */
+ default:
+ if (!top)
+ __wt_ref_out(session, ref);
+ break;
+ }
+}
+
+/*
+ * __rec_review --
+ * Get exclusive access to the page and review the page and its subtree
+ * for conditions that would block its eviction.
+ */
+static int
+__rec_review(
+ WT_SESSION_IMPL *session, WT_REF *ref, int exclusive, int top, int *istree)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *child;
+ uint32_t flags;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ /*
+ * Get exclusive access to the page if our caller doesn't have the tree
+ * locked down.
+ */
+ if (!exclusive) {
+ WT_RET(__hazard_exclusive(session, ref, top));
+
+ /*
+ * Now the page is locked, remove it from the LRU eviction
+ * queue. We have to do this before freeing the page memory or
+ * otherwise touching the reference because eviction paths
+ * assume a non-NULL reference on the queue is pointing at
+ * valid memory.
+ */
+ __wt_evict_list_clear_page(session, ref);
+ }
+
+ /*
+ * Recurse through the page's subtree: this happens first because we
+ * have to write pages in depth-first order, otherwise we'll dirty
+ * pages after we've written them.
+ */
+ if (WT_PAGE_IS_INTERNAL(page))
+ WT_INTL_FOREACH_BEGIN(session, page, child) {
+ switch (child->state) {
+ case WT_REF_DISK: /* On-disk */
+ case WT_REF_DELETED: /* On-disk, deleted */
+ break;
+ case WT_REF_MEM: /* In-memory */
+ /*
+ * Tell our caller if there's a subtree so we
+ * know to do a full walk when discarding the
+ * page.
+ */
+ *istree = 1;
+ WT_RET(__rec_review(
+ session, child, exclusive, 0, istree));
+ break;
+ case WT_REF_LOCKED: /* Being evicted */
+ case WT_REF_READING: /* Being read */
+ case WT_REF_SPLIT: /* Being split */
+ return (EBUSY);
+ WT_ILLEGAL_VALUE(session);
+ }
+ } WT_INTL_FOREACH_END;
+
+ mod = page->modify;
+
+ /*
+ * If the tree was deepened, there's a requirement that newly created
+ * internal pages not be evicted until all threads are known to have
+ * exited the original page index array, because evicting an internal
+ * page discards its WT_REF array, and a thread traversing the original
+ * page index array might see a freed WT_REF. During the split we set
+ * a transaction value, once that's globally visible, we know we can
+ * evict the created page.
+ */
+ if (!exclusive && mod != NULL && WT_PAGE_IS_INTERNAL(page) &&
+ !__wt_txn_visible_all(session, mod->mod_split_txn))
+ return (EBUSY);
+
+ /*
+ * If the file is being checkpointed, we can't evict dirty pages:
+ * if we write a page and free the previous version of the page, that
+ * previous version might be referenced by an internal page already
+ * written in the checkpoint, leaving the checkpoint inconsistent.
+ *
+ * Don't rely on new updates being skipped by the transaction used
+ * for transaction reads: (1) there are paths that dirty pages for
+ * artificial reasons; (2) internal pages aren't transactional; and
+ * (3) if an update was skipped during the checkpoint (leaving the page
+ * dirty), then rolled back, we could still successfully overwrite a
+ * page and corrupt the checkpoint.
+ *
+ * Further, we can't race with the checkpoint's reconciliation of
+ * an internal page as we evict a clean child from the page's subtree.
+ * This works in the usual way: eviction locks the page and then checks
+ * for existing hazard pointers, the checkpoint thread reconciling an
+ * internal page acquires hazard pointers on child pages it reads, and
+ * is blocked by the exclusive lock.
+ */
+ if (mod != NULL && btree->checkpointing &&
+ (__wt_page_is_modified(page) ||
+ F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
+ return (EBUSY);
+ }
+
+ /*
+ * Fail if any page in the top-level page's subtree won't be merged into
+ * its parent; the page that cannot be merged must be evicted first.
+ * The test is necessary but should not fire much: the eviction code is
+ * biased for leaf pages, an internal page shouldn't be selected for
+ * eviction until its children have been evicted.
+ *
+ * We have to write dirty pages to know their final state: a page
+ * marked empty may have had records added since reconciliation.
+ * Writing the page is expensive, so do a cheap test first: if it
+ * doesn't seem likely a subtree page can be merged, quit.
+ */
+ if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
+ return (EBUSY);
+
+ /*
+ * If the page is dirty and can possibly change state, write it so we
+ * know the final state.
+ *
+ * If we have an exclusive lock (we're discarding the tree), assert
+ * there are no updates we cannot read.
+ *
+ * Otherwise, if the top-level page we're evicting is a leaf page, set
+ * the update-restore flag, so reconciliation will write blocks it can
+ * write and create a list of skipped updates for blocks it cannot
+ * write. This is how forced eviction of huge pages works: we take a
+ * big page and reconcile it into blocks, some of which we write and
+ * discard, the rest of which we re-create as smaller in-memory pages,
+ * (restoring the updates that stopped us from writing the block), and
+ * inserting the whole mess into the page's parent.
+ *
+ * Don't set the update-restore flag for internal pages, they don't
+ * have updates that can be saved and restored.
+ *
+ * Don't set the update-restore flag for small pages. (If a small
+ * page were selected by eviction and then modified, and we configure it
+ * for update-restore, we'll end up splitting one or two pages into the
+ * parent, which is a waste of effort. If we don't set update-restore,
+ * eviction will return EBUSY, which makes more sense, the page was just
+ * modified.)
+ *
+ * Don't set the update-restore flag for any page other than the
+ * top one; only the reconciled top page goes through the split path
+ * (and child pages are pages we expect to merge into the top page;
+ * they are not expected to split).
+ */
+ if (__wt_page_is_modified(page)) {
+ flags = WT_EVICTING;
+ if (exclusive)
+ LF_SET(WT_SKIP_UPDATE_ERR);
+ else if (top && !WT_PAGE_IS_INTERNAL(page) &&
+ page->memory_footprint > 10 * btree->maxleafpage)
+ LF_SET(WT_SKIP_UPDATE_RESTORE);
+ WT_RET(__wt_rec_write(session, ref, NULL, flags));
+ WT_ASSERT(session,
+ !__wt_page_is_modified(page) ||
+ LF_ISSET(WT_SKIP_UPDATE_RESTORE));
+ } else {
+ /*
+ * If the page was ever modified, make sure all of the updates
+ * on the page are old enough they can be discarded from cache.
+ */
+ if (!exclusive && mod != NULL &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ return (EBUSY);
+ }
+
+ /*
+ * Repeat the test: fail if any page in the top-level page's subtree
+ * won't be merged into its parent.
+ */
+ if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY)))
+ return (EBUSY);
+
+ return (0);
+}
+
+/*
+ * __rec_excl_clear --
+ * Discard exclusive access and return a page's subtree to availability.
+ */
+static void
+__rec_excl_clear(WT_SESSION_IMPL *session)
+{
+ WT_REF *ref;
+ uint32_t i;
+
+ for (i = 0; i < session->excl_next; ++i) {
+ if ((ref = session->excl[i]) == NULL)
+ break;
+ WT_ASSERT(session,
+ ref->state == WT_REF_LOCKED && ref->page != NULL);
+ ref->state = WT_REF_MEM;
+ }
+}
+
+/*
+ * __hazard_exclusive --
+ * Request exclusive access to a page.
+ */
+static int
+__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
+{
+ /*
+ * Make sure there is space to track exclusive access so we can unlock
+ * to clean up.
+ */
+ WT_RET(__wt_realloc_def(session, &session->excl_allocated,
+ session->excl_next + 1, &session->excl));
+
+ /*
+ * Request exclusive access to the page. The top-level page should
+ * already be in the locked state, lock child pages in memory.
+ * If another thread already has this page, give up.
+ */
+ if (!top && !WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED))
+ return (EBUSY); /* We couldn't change the state. */
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+
+ session->excl[session->excl_next++] = ref;
+
+ /* Check for a matching hazard pointer. */
+ if (__wt_page_hazard_check(session, ref->page) == NULL)
+ return (0);
+
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_hazard);
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_hazard);
+
+ WT_RET(__wt_verbose(session, WT_VERB_EVICT,
+ "page %p hazard request failed", ref->page));
+ return (EBUSY);
+}
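+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of the locking step in
+ * __hazard_exclusive, not part of the WiredTiger sources: atomically move
+ * a reference from "in memory" to "locked", then fail if a reader still
+ * holds a hazard pointer. C11 atomics stand in for WT_ATOMIC_CAS4, and
+ * for simplicity this sketch unlocks when the hazard check fails, where
+ * the real code defers cleanup to __rec_excl_clear. TOY_REF,
+ * TOY_MEM/TOY_LOCKED, hazard_held and toy_lock_exclusive() are
+ * hypothetical names.
+ */
+#include <stdatomic.h>
+#include <stdbool.h>
+
+enum { TOY_MEM, TOY_LOCKED };
+
+typedef struct {
+ _Atomic int state;
+} TOY_REF;
+
+static bool
+toy_lock_exclusive(TOY_REF *ref, bool (*hazard_held)(TOY_REF *))
+{
+ int expected = TOY_MEM;
+
+ if (!atomic_compare_exchange_strong(
+ &ref->state, &expected, TOY_LOCKED))
+ return (false); /* Another thread owns the page. */
+
+ if (hazard_held(ref)) { /* A reader still has it pinned. */
+ atomic_store(&ref->state, TOY_MEM);
+ return (false);
+ }
+ return (true); /* Exclusive access granted. */
+}
+#endif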
diff --git a/src/third_party/wiredtiger/src/btree/rec_split.c b/src/third_party/wiredtiger/src/btree/rec_split.c
new file mode 100644
index 00000000000..babec2cc295
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_split.c
@@ -0,0 +1,1121 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Tuning: global variables to allow the binary to be patched; we don't yet have
+ * any real understanding of what might be useful to surface to applications.
+ */
+static u_int __split_deepen_max_internal_image = 100;
+static u_int __split_deepen_min_child = 10;
+static u_int __split_deepen_per_child = 100;
+static u_int __split_deepen_split_child = 100;
+
+/*
+ * Track allocation increments, matching the cache calculations, which add an
+ * estimate of allocation overhead to every object.
+ */
+#define WT_MEMSIZE_ADD(total, len) do { \
+ total += (len) + WT_ALLOC_OVERHEAD; \
+} while (0)
+#define WT_MEMSIZE_TRANSFER(from_decr, to_incr, len) do { \
+ WT_MEMSIZE_ADD(from_decr, len); \
+ WT_MEMSIZE_ADD(to_incr, len); \
+} while (0)
+
+/*
+ * __split_oldest_gen --
+ * Calculate the oldest active split generation.
+ */
+static uint64_t
+__split_oldest_gen(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *s;
+ uint64_t gen, oldest;
+ u_int i, session_cnt;
+
+ conn = S2C(session);
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = conn->sessions, oldest = conn->split_gen + 1;
+ i < session_cnt;
+ i++, s++)
+ if (((gen = s->split_gen) != 0) && gen < oldest)
+ oldest = gen;
+
+ return (oldest);
+}
+
+/*
+ * __split_stash_add --
+ * Add a new entry into the session's split stash list.
+ */
+static int
+__split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len)
+{
+ WT_SPLIT_STASH *stash;
+
+ WT_ASSERT(session, p != NULL);
+
+ /* Grow the list as necessary. */
+ WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
+ session->split_stash_cnt + 1, &session->split_stash));
+
+ stash = session->split_stash + session->split_stash_cnt++;
+ stash->split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
+ stash->p = p;
+ stash->len = len;
+
+ WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
+ WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);
+
+ /* See if we can free any previous entries. */
+ if (session->split_stash_cnt > 1)
+ __wt_split_stash_discard(session);
+
+ return (0);
+}
+
+/*
+ * __wt_split_stash_discard --
+ * Discard any memory from a session's split stash that we can.
+ */
+void
+__wt_split_stash_discard(WT_SESSION_IMPL *session)
+{
+ WT_SPLIT_STASH *stash;
+ uint64_t oldest;
+ size_t i;
+
+ /* Get the oldest split generation. */
+ oldest = __split_oldest_gen(session);
+
+ for (i = 0, stash = session->split_stash;
+ i < session->split_stash_cnt;
+ ++i, ++stash) {
+ if (stash->p == NULL)
+ continue;
+ else if (stash->split_gen >= oldest)
+ break;
+ /*
+ * It's a bad thing if another thread is in this memory after
+ * we free it, make sure nothing good happens to that thread.
+ */
+ WT_STAT_FAST_CONN_ATOMIC_DECRV(
+ session, rec_split_stashed_bytes, stash->len);
+ WT_STAT_FAST_CONN_ATOMIC_DECR(
+ session, rec_split_stashed_objects);
+ __wt_overwrite_and_free_len(session, stash->p, stash->len);
+ }
+
+ /*
+ * If there are enough free slots at the beginning of the list, shuffle
+ * everything down.
+ */
+ if (i > 100 || i == session->split_stash_cnt)
+ if ((session->split_stash_cnt -= i) > 0)
+ memmove(session->split_stash, stash,
+ session->split_stash_cnt * sizeof(*stash));
+}
+
+/*
+ * __wt_split_stash_discard_all --
+ * Discard all memory from a session's split stash.
+ */
+void
+__wt_split_stash_discard_all(
+ WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session)
+{
+ WT_SPLIT_STASH *stash;
+ size_t i;
+
+ /*
+ * This function is called during WT_CONNECTION.close to discard any
+ * memory that remains. For that reason, we take two WT_SESSION_IMPL
+ * arguments: session_safe is still linked to the WT_CONNECTION and
+ * can be safely used for calls to other WiredTiger functions, while
+ * session is the WT_SESSION_IMPL we're cleaning up.
+ */
+ for (i = 0, stash = session->split_stash;
+ i < session->split_stash_cnt;
+ ++i, ++stash)
+ if (stash->p != NULL)
+ __wt_free(session_safe, stash->p);
+
+ __wt_free(session_safe, session->split_stash);
+ session->split_stash_cnt = session->split_stash_alloc = 0;
+}
+
+/*
+ * __split_safe_free --
+ * Free a buffer if we can be sure no thread is accessing it, or schedule
+ * it to be freed otherwise.
+ */
+static int
+__split_safe_free(WT_SESSION_IMPL *session, int exclusive, void *p, size_t s)
+{
+ /*
+ * We have swapped something in a page: if we don't have exclusive
+ * access, check whether there are other threads in the same tree.
+ */
+ if (!exclusive &&
+ __split_oldest_gen(session) == S2C(session)->split_gen + 1)
+ exclusive = 1;
+
+ if (exclusive) {
+ __wt_free(session, p);
+ return (0);
+ }
+
+ return (__split_stash_add(session, p, s));
+}
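+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal, single-threaded model of the split-generation
+ * reclamation scheme above, not part of the WiredTiger sources (no memory
+ * barriers, fixed reader count). Readers publish the generation they
+ * entered under; a retired buffer is stamped with the generation in force
+ * when it was unpublished and may be freed only once every active reader
+ * has moved past that generation. All toy_* names are hypothetical.
+ */
+#include <stdbool.h>
+
+#define TOY_READERS 8
+
+static uint64_t toy_global_gen = 1; /* Bumped on each retirement. */
+static uint64_t toy_reader_gen[TOY_READERS]; /* 0 when a reader is idle. */
+
+/* The oldest generation any active reader might still be using. */
+static uint64_t
+toy_oldest_gen(void)
+{
+ uint64_t gen, oldest;
+ int i;
+
+ for (i = 0, oldest = toy_global_gen + 1; i < TOY_READERS; ++i)
+ if ((gen = toy_reader_gen[i]) != 0 && gen < oldest)
+ oldest = gen;
+ return (oldest);
+}
+
+/* A retired buffer is safe to free once it predates every active reader. */
+static bool
+toy_can_free(uint64_t retired_gen)
+{
+ return (retired_gen < toy_oldest_gen());
+}
+#endif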
+
+/*
+ * __split_should_deepen --
+ * Return if we should deepen the tree.
+ */
+static int
+__split_should_deepen(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_PAGE_INDEX *pindex;
+
+ /*
+ * Splits are based either on the number of child pages that will be
+ * created by the split (splitting an internal page that will be slow
+ * to search), or on the memory footprint of the parent page (avoiding
+ * an internal page that will eat up all of the cache and put eviction
+ * pressure on the system).
+ */
+ pindex = WT_INTL_INDEX_COPY(page);
+
+ /*
+ * Deepen the tree if the page's memory footprint is larger than the
+ * maximum size for a page in memory. We need an absolute minimum
+ * number of entries in order to split the page: if there is a single
+ * huge key, splitting won't help.
+ */
+ if (page->memory_footprint > S2BT(session)->maxmempage &&
+ pindex->entries >= __split_deepen_min_child)
+ return (1);
+
+ /*
+ * Deepen the tree if the page's memory footprint is at least N
+ * times the maximum internal page size chunk in the backing file and
+ * the split will result in at least N children in the newly created
+ * intermediate layer.
+ */
+ if (page->memory_footprint >
+ __split_deepen_max_internal_image * S2BT(session)->maxintlpage &&
+ pindex->entries >=
+ (__split_deepen_per_child * __split_deepen_split_child))
+ return (1);
+
+ return (0);
+}
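+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal sketch of how the tuning knobs above translate
+ * into a child count when the split runs, not part of the WiredTiger
+ * sources; toy_deepen_children() is a hypothetical name. With the defaults
+ * (per-child 100, minimum 10), a 25,000-entry index deepens into 250
+ * children and a 500-entry index into the 10-child minimum.
+ */
+static uint32_t
+toy_deepen_children(uint32_t entries)
+{
+ uint32_t children;
+
+ children = entries / __split_deepen_per_child;
+ if (children < __split_deepen_min_child)
+ children = __split_deepen_min_child;
+ return (children);
+}
+#endif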
+
+/*
+ * __split_ovfl_key_cleanup --
+ * Handle cleanup for on-page row-store overflow keys.
+ */
+static int
+__split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
+{
+ WT_CELL *cell;
+ WT_CELL_UNPACK kpack;
+ WT_IKEY *ikey;
+ uint32_t cell_offset;
+
+ /*
+ * A key being discarded (page split) or moved to a different page (page
+ * deepening) may be an on-page overflow key. Clear any reference to an
+ * underlying disk image, and, if the key hasn't been deleted, delete it
+ * along with any backing blocks.
+ */
+ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL)
+ return (0);
+ if ((cell_offset = ikey->cell_offset) == 0)
+ return (0);
+
+ /* Leak blocks rather than try this twice. */
+ ikey->cell_offset = 0;
+
+ cell = WT_PAGE_REF_OFFSET(page, cell_offset);
+ __wt_cell_unpack(cell, &kpack);
+ if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM)
+ WT_RET(__wt_ovfl_discard(session, cell));
+
+ return (0);
+}
+
+/*
+ * __split_ref_instantiate --
+ * Instantiate key/address pairs in memory in service of a split.
+ */
+static int
+__split_ref_instantiate(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK unpack;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ size_t size;
+ void *key;
+
+ /*
+ * Instantiate row-store keys, and column- and row-store addresses in
+ * the WT_REF structures referenced by a page that's being split (and
+ * deepening the tree). The WT_REF structures aren't moving, but the
+ * index references are moving from the page we're splitting to a set
+ * of child pages, and so we can no longer reference the block image
+ * that remains with the page being split.
+ *
+ * Track how much memory the parent is losing and the child gaining.
+ *
+ * No locking is required to update the WT_REF structure because we're
+ * the only thread splitting the parent page, and there's no way for
+ * readers to race with our updates of single pointers. The changes
+ * have to be written before the page goes away, of course, our caller
+ * owns that problem.
+ *
+ * Row-store keys, first.
+ */
+ if (page->type == WT_PAGE_ROW_INT) {
+ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
+ __wt_ref_key(page, ref, &key, &size);
+ WT_RET(__wt_row_ikey(session, 0, key, size, &ikey));
+ ref->key.ikey = ikey;
+ } else {
+ WT_RET(__split_ovfl_key_cleanup(session, page, ref));
+ WT_MEMSIZE_ADD(*parent_decrp,
+ sizeof(WT_IKEY) + ikey->size);
+ }
+ WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_IKEY) + ikey->size);
+ }
+
+ /*
+ * If there's no address (the page has never been written), or the
+ * address has been instantiated, there's no work to do. Otherwise,
+ * get the address from the on-page cell.
+ */
+ if ((addr = ref->addr) == NULL)
+ return (0);
+ if (__wt_off_page(page, addr))
+ WT_MEMSIZE_TRANSFER(*parent_decrp, *child_incrp,
+ sizeof(WT_ADDR) + addr->size);
+ else {
+ __wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
+ WT_RET(__wt_calloc_def(session, 1, &addr));
+ if ((ret = __wt_strndup(
+ session, unpack.data, unpack.size, &addr->addr)) != 0) {
+ __wt_free(session, addr);
+ return (ret);
+ }
+ addr->size = (uint8_t)unpack.size;
+ addr->type =
+ unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
+ ref->addr = addr;
+ WT_MEMSIZE_ADD(*child_incrp, sizeof(WT_ADDR) + addr->size);
+ }
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __split_verify_intl_key_order --
+ * Verify the key order on an internal page after a split, diagnostic only.
+ */
+static void
+__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_ITEM *next, _next, *last, _last, *tmp;
+ WT_REF *ref;
+ uint64_t recno;
+ int cmp, first;
+
+ btree = S2BT(session);
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ recno = 0;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ WT_ASSERT(session, ref->key.recno > recno);
+ recno = ref->key.recno;
+ } WT_INTL_FOREACH_END;
+ break;
+ case WT_PAGE_ROW_INT:
+ next = &_next;
+ WT_CLEAR(_next);
+ last = &_last;
+ WT_CLEAR(_last);
+
+ first = 1;
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ __wt_ref_key(page, ref, &next->data, &next->size);
+ if (last->size == 0) {
+ if (first)
+ first = 0;
+ else {
+ WT_ASSERT(session, __wt_compare(
+ session, btree->collator, last,
+ next, &cmp) == 0);
+ WT_ASSERT(session, cmp < 0);
+ }
+ }
+ tmp = last;
+ last = next;
+ next = tmp;
+ } WT_INTL_FOREACH_END;
+ break;
+ }
+}
+#endif
+
+/*
+ * __split_deepen --
+ * Split an internal page in-memory, deepening the tree.
+ */
+static int
+__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
+{
+ WT_DECL_RET;
+ WT_PAGE *child;
+ WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
+ WT_REF **alloc_refp;
+ WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
+ size_t child_incr, parent_decr, parent_incr, size;
+ uint32_t children, chunk, i, j, remain, slots;
+ int panic;
+ void *p;
+
+ alloc_index = NULL;
+ parent_incr = parent_decr = 0;
+ panic = 0;
+
+ pindex = WT_INTL_INDEX_COPY(parent);
+
+ /*
+ * Create N children, unless we are dealing with a large page without
+ * many entries, in which case split into the minimum number of pages.
+ */
+ children = WT_MAX(pindex->entries / __split_deepen_per_child,
+ __split_deepen_min_child);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
+ parent, pindex->entries, children));
+
+ /*
+ * If the workload is prepending/appending to the tree, we could deepen
+ * without bound. Don't let that happen, keep the first/last pages of
+ * the tree at their current level.
+ *
+ * XXX
+ * To improve this, we could track which pages were last merged into
+ * this page by eviction, and leave those pages alone, to prevent any
+ * sustained insert into the tree from deepening a single location.
+ */
+#undef SPLIT_CORRECT_1
+#define SPLIT_CORRECT_1 1 /* First page correction */
+#undef SPLIT_CORRECT_2
+#define SPLIT_CORRECT_2 2 /* First/last page correction */
+
+ /*
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
+ * the first/last slots of the allocated WT_PAGE_INDEX to point to the
+ * first/last pages we're keeping at the current level, and the rest of
+ * the slots to point to new WT_REF objects.
+ */
+ size = sizeof(WT_PAGE_INDEX) +
+ (children + SPLIT_CORRECT_2) * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ WT_MEMSIZE_ADD(parent_incr, size);
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = children + SPLIT_CORRECT_2;
+ alloc_index->index[0] = pindex->index[0];
+ alloc_index->index[alloc_index->entries - 1] =
+ pindex->index[pindex->entries - 1];
+ for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
+ i = 0; i < children; ++alloc_refp, ++i) {
+ WT_ERR(__wt_calloc_def(session, 1, alloc_refp));
+ WT_MEMSIZE_ADD(parent_incr, sizeof(WT_REF));
+ }
+
+ /* Allocate child pages, and connect them into the new page index. */
+ chunk = (pindex->entries - SPLIT_CORRECT_2) / children;
+ remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1);
+ for (parent_refp = pindex->index + SPLIT_CORRECT_1,
+ alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
+ i = 0; i < children; ++i) {
+ slots = i == children - 1 ? remain : chunk;
+ WT_ERR(__wt_page_alloc(
+ session, parent->type, 0, slots, 0, &child));
+
+ /*
+ * Initialize the parent page's child reference; we need a copy
+ * of the page's key.
+ */
+ ref = *alloc_refp++;
+ ref->home = parent;
+ ref->page = child;
+ ref->addr = NULL;
+ if (parent->type == WT_PAGE_ROW_INT) {
+ __wt_ref_key(parent, *parent_refp, &p, &size);
+ WT_ERR(
+ __wt_row_ikey(session, 0, p, size, &ref->key.ikey));
+ WT_MEMSIZE_ADD(parent_incr, sizeof(WT_IKEY) + size);
+ } else
+ ref->key.recno = (*parent_refp)->key.recno;
+ ref->state = WT_REF_MEM;
+
+ /* Initialize the child page. */
+ if (parent->type == WT_PAGE_COL_INT)
+ child->pg_intl_recno = (*parent_refp)->key.recno;
+ child->pg_intl_parent_ref = ref;
+
+ /* Mark it dirty. */
+ WT_ERR(__wt_page_modify_init(session, child));
+ __wt_page_only_modify_set(session, child);
+
+ /*
+ * Once the split goes live, the newly created internal pages
+ * might be evicted and their WT_REF structures freed. If those
+ * pages are evicted before threads exit the previous page index
+ * array, a thread might see a freed WT_REF. Set the eviction
+ * transaction requirement for the newly created internal pages.
+ */
+ child->modify->mod_split_txn = __wt_txn_new_id(session);
+
+ /*
+ * The newly allocated child's page index references the same
+ * structures as the parent. (We cannot move WT_REF structures,
+ * threads may be underneath us right now changing the structure
+ * state.) However, if the WT_REF structures reference on-page
+ * information, we have to fix that, because the disk image for
+ * the page that has a page index entry for the WT_REF is about
+ * to change.
+ */
+ child_incr = 0;
+ child_pindex = WT_INTL_INDEX_COPY(child);
+ for (child_refp = child_pindex->index, j = 0; j < slots; ++j) {
+ WT_ERR(__split_ref_instantiate(session,
+ parent, *parent_refp, &parent_decr, &child_incr));
+ *child_refp++ = *parent_refp++;
+
+ WT_MEMSIZE_TRANSFER(
+ parent_decr, child_incr, sizeof(WT_REF));
+ }
+ __wt_cache_page_inmem_incr(session, child, child_incr);
+ }
+ WT_ASSERT(session, alloc_refp -
+ alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1);
+ WT_ASSERT(session,
+ parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1);
+
+ /*
+ * Update the parent's index; this is the update which splits the page,
+ * making the change visible to threads descending the tree. From now
+ * on, we're committed to the split. If any subsequent work fails, we
+ * have to panic because we potentially have threads of control using
+ * the new page index we just swapped in.
+ *
+ * A note on error handling: until this point, there's no problem with
+ * unwinding on error. We allocated a new page index, a new set of
+ * WT_REFs and a new set of child pages -- if an error occurred, the
+ * parent remained unchanged, although it may have an incorrect memory
+ * footprint. From now on we've modified the parent page, attention
+ * needs to be paid.
+ */
+ WT_INTL_INDEX_SET(parent, alloc_index);
+ panic = 1;
+
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, parent);
+#endif
+
+ /*
+ * The moved reference structures now reference the wrong parent page,
+ * and we have to fix that up. The problem is revealed when a thread
+ * of control searches for a page's reference structure slot, and fails
+ * to find it because the page it's searching no longer references it.
+ * When that failure happens, the thread waits for the reference's home
+ * page to be updated, which we do here: walk the children and fix them
+ * up.
+ *
+ * We're not acquiring hazard pointers on these pages, they cannot be
+ * evicted because of the eviction transaction value set above.
+ */
+ for (parent_refp = alloc_index->index,
+ i = alloc_index->entries; i > 0; ++parent_refp, --i) {
+ parent_ref = *parent_refp;
+ WT_ASSERT(session, parent_ref->home == parent);
+ if (parent_ref->state != WT_REF_MEM)
+ continue;
+
+ /*
+ * We left the first/last children of the parent at the current
+ * level to avoid bad split patterns, they might be leaf pages;
+ * check the page type before we continue.
+ */
+ child = parent_ref->page;
+ if (!WT_PAGE_IS_INTERNAL(child))
+ continue;
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, child);
+#endif
+ WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
+ /*
+ * The page's parent reference may not be wrong: we
+ * opened up access from the top of the tree already,
+ * and pages may have been read in since then. Check,
+ * and only update pages that reference the original
+ * page; those must be wrong.
+ */
+ if (child_ref->home == parent) {
+ child_ref->home = child;
+ child_ref->ref_hint = 0;
+ }
+ } WT_INTL_FOREACH_END;
+ }
+
+ /*
+ * Push out the changes: not required for correctness, but don't let
+ * threads spin on incorrect page references longer than necessary.
+ */
+ WT_FULL_BARRIER();
+ alloc_index = NULL;
+
+ /*
+ * We can't free the previous parent's index, there may be threads using
+ * it. Add to the session's discard list, to be freed once we know no
+ * threads can still be using it.
+ *
+ * This change requires care with error handling: we have already
+ * updated the page with a new index. Even if stashing the old value
+ * fails, we don't roll back that change, because threads may already
+ * be using the new index.
+ */
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_MEMSIZE_ADD(parent_decr, size);
+ WT_ERR(__split_safe_free(session, 0, pindex, size));
+
+ /*
+ * Adjust the parent's memory footprint. This may look odd, but we
+ * have already taken the allocation overhead into account, and an
+ * increment followed by a decrement will cancel out the normal
+ * adjustment.
+ */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+
+ if (0) {
+err: __wt_free_ref_index(session, parent, alloc_index, 1);
+
+ /*
+ * If panic is set, we saw an error after opening up the tree
+ * to descent through the parent page's new index. There is
+ * nothing we can do, the tree is inconsistent and there are
+ * threads potentially active in both versions of the tree.
+ */
+ if (panic)
+ ret = __wt_panic(session);
+ }
+ return (ret);
+}
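+
+#ifdef WT_EDITOR_SKETCH
+/*
+ * Editor's note: a minimal check of the chunk/remain arithmetic in
+ * __split_deepen, not part of the WiredTiger sources;
+ * toy_partition_check() is a hypothetical name. After the two first/last
+ * slots are reserved, each child takes "chunk" entries and the final child
+ * absorbs the remainder, so remain == chunk + ((entries - 2) % children)
+ * and every slot is assigned exactly once. E.g., 1,002 entries split into
+ * 10 children gives chunk == remain == 100.
+ */
+#include <assert.h>
+
+static void
+toy_partition_check(uint32_t entries, uint32_t children)
+{
+ uint32_t chunk, remain;
+
+ chunk = (entries - 2) / children;
+ remain = (entries - 2) - chunk * (children - 1);
+
+ /* The first children - 1 chunks plus the remainder cover it all. */
+ assert(chunk * (children - 1) + remain == entries - 2);
+ /* The last child is never smaller than the others. */
+ assert(remain >= chunk);
+}
+#endif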
+
+/*
+ * __split_inmem_build --
+ * Instantiate a page in a multi-block set, when an update couldn't be
+ * written.
+ */
+static int
+__split_inmem_build(
+ WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref, WT_MULTI *multi)
+{
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ WT_UPD_SKIPPED *skip;
+ uint64_t recno;
+ uint32_t i, slot;
+
+ WT_CLEAR(cbt);
+ cbt.iface.session = &session->iface;
+ cbt.btree = S2BT(session);
+
+ /*
+ * We can find unresolved updates, which can't be written, when
+ * attempting to evict a page. This code re-creates the in-memory
+ * page and applies the unresolved updates to that page.
+ *
+ * Clear the disk image and link the page into the passed-in WT_REF to
+ * simplify error handling: our caller will not discard the disk image
+ * when discarding the original page, and our caller will discard the
+ * allocated page on error, when discarding the allocated WT_REF.
+ */
+ WT_RET(__wt_page_inmem(
+ session, ref, multi->skip_dsk, WT_PAGE_DISK_ALLOC, &page));
+ multi->skip_dsk = NULL;
+
+ if (orig->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Re-create each modification we couldn't write. */
+ for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip)
+ switch (orig->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ /* Build a key. */
+ upd = skip->ins->upd;
+ skip->ins->upd = NULL;
+ recno = WT_INSERT_RECNO(skip->ins);
+
+ /* Search the page. */
+ WT_ERR(__wt_col_search(session, recno, ref, &cbt));
+
+ /* Apply the modification. */
+ WT_ERR(__wt_col_modify(
+ session, &cbt, recno, NULL, upd, 0));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /* Build a key. */
+ if (skip->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, skip->rip);
+ upd = orig->pg_row_upd[slot];
+ orig->pg_row_upd[slot] = NULL;
+
+ WT_ERR(__wt_row_leaf_key(
+ session, orig, skip->rip, key, 0));
+ } else {
+ upd = skip->ins->upd;
+ skip->ins->upd = NULL;
+
+ key->data = WT_INSERT_KEY(skip->ins);
+ key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Search the page. */
+ WT_ERR(__wt_row_search(session, key, ref, &cbt, 1));
+
+ /* Apply the modification. */
+ WT_ERR(
+ __wt_row_modify(session, &cbt, key, NULL, upd, 0));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * We modified the page above, which will have set the first dirty
+ * transaction to the last transaction current running. However, the
+ * updates we installed may be older than that. Take the oldest active
+ * transaction ID to make sure these updates are not skipped by a
+ * checkpoint.
+ */
+ page->modify->first_dirty_txn = S2C(session)->txn_global.oldest_id;
+
+err: __wt_scr_free(&key);
+ /* Free any resources that may have been cached in the cursor. */
+ WT_TRET(__wt_btcur_close(&cbt));
+ return (ret);
+}
+
+/*
+ * __wt_multi_to_ref --
+ * Move a multi-block list into an array of WT_REF structures.
+ */
+int
+__wt_multi_to_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp)
+{
+ WT_ADDR *addr;
+ WT_IKEY *ikey;
+ WT_REF *ref;
+ size_t incr;
+
+ addr = NULL;
+ incr = 0;
+
+ /* In some cases, the underlying WT_REF has not yet been allocated. */
+ if (*refp == NULL) {
+ WT_RET(__wt_calloc_def(session, 1, refp));
+ WT_MEMSIZE_ADD(incr, sizeof(WT_REF));
+ }
+ ref = *refp;
+
+ /*
+ * Any parent reference must be filled in by our caller; the primary
+ * use of this function is when splitting into a parent page, and we
+ * aren't holding any locks here that would allow us to know which
+ * parent we'll eventually split into, if the tree is simultaneously
+ * being deepened.
+ */
+ ref->home = NULL;
+
+ if (multi->skip == NULL) {
+ /*
+ * Copy the address: we could simply take the buffer, but that
+ * would complicate error handling, freeing the reference array
+ * would have to avoid freeing the memory, and it's not worth
+ * the confusion.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &addr));
+ WT_MEMSIZE_ADD(incr, sizeof(WT_ADDR));
+ ref->addr = addr;
+ addr->size = multi->addr.size;
+ addr->type = multi->addr.type;
+ WT_RET(__wt_strndup(session,
+ multi->addr.addr, addr->size, &addr->addr));
+ WT_MEMSIZE_ADD(incr, addr->size);
+ } else
+ WT_RET(__split_inmem_build(session, page, ref, multi));
+
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ ikey = multi->key.ikey;
+ WT_RET(__wt_row_ikey(session, 0,
+ WT_IKEY_DATA(ikey), ikey->size, &ref->key.ikey));
+ WT_MEMSIZE_ADD(incr, sizeof(WT_IKEY) + ikey->size);
+ break;
+ default:
+ ref->key.recno = multi->key.recno;
+ break;
+ }
+
+ ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM;
+
+ /*
+ * If our caller wants to track the memory allocations, add what we
+ * allocated to the caller's running total.
+ */
+ if (incrp != NULL)
+ *incrp += incr;
+ return (0);
+}
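+
+/*
+ * A minimal caller sketch (not built; the __example_* name is hypothetical):
+ * convert each block created by a multi-block reconciliation into a WT_REF,
+ * accumulating the memory footprint adjustment the parent will be charged.
+ * This mirrors the loops in __split_evict_multi below and in the root-split
+ * code.
+ */
+#if 0
+static int
+__example_multi_to_ref(
+    WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF ***refsp, size_t *incrp)
+{
+ WT_PAGE_MODIFY *mod;
+ WT_REF **refs;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /* One WT_REF slot per block; the array starts out cleared. */
+ WT_RET(__wt_calloc_def(session, mod->mod_multi_entries, &refs));
+
+ /* Real callers free the array and refs on error, omitted here. */
+ for (i = 0; i < mod->mod_multi_entries; ++i)
+  WT_RET(__wt_multi_to_ref(session,
+      page, &mod->mod_multi[i], &refs[i], incrp));
+
+ *refsp = refs;
+ return (0);
+}
+#endif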
+
+/*
+ * __split_evict_multi --
+ * Resolve a multi-page split, inserting new information into the parent.
+ */
+static int
+__split_evict_multi(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_PAGE *parent, *child;
+ WT_PAGE_INDEX *alloc_index, *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_REF **alloc_refp, *parent_ref, ref_copy, **ref_tmp;
+ size_t parent_decr, parent_incr, size;
+ uint32_t i, j, parent_entries, result_entries, split_entries;
+ int complete, hazard, locked;
+
+ parent = NULL; /* -Wconditional-uninitialized */
+ alloc_index = NULL;
+ parent_ref = NULL;
+ ref_tmp = NULL;
+ parent_decr = parent_incr = 0;
+ complete = hazard = locked = 0;
+
+ child = ref->page;
+ mod = child->modify;
+
+ /*
+ * Convert the split page's multiblock reconciliation information into
+ * an array of page reference structures.
+ */
+ split_entries = mod->mod_multi_entries;
+ WT_RET(__wt_calloc_def(session, split_entries, &ref_tmp));
+ for (i = 0; i < split_entries; ++i)
+ WT_ERR(__wt_multi_to_ref(session,
+ child, &mod->mod_multi[i], &ref_tmp[i], &parent_incr));
+
+ /*
+ * Get a page-level lock on the parent to single-thread splits into the
+ * page because we need to single-thread sizing/growing the page index.
+ * It's OK to queue up multiple splits as the child pages split, but the
+ * actual split into the parent has to be serialized. Note we allocate
+ * memory inside of the lock and may want to invest effort in making the
+ * locked period shorter.
+ *
+ * We could race with another thread deepening our parent. To deal
+ * with that, read the parent pointer each time we try to lock it, and
+ * check that it's still correct after it is locked.
+ */
+ for (;;) {
+ parent = ref->home;
+ F_CAS_ATOMIC(parent, WT_PAGE_SPLITTING, ret);
+ if (ret == 0) {
+ if (parent == ref->home)
+ break;
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+ continue;
+ }
+ __wt_yield();
+ }
+ locked = 1;
+
+ /*
+ * We have exclusive access to split the parent, and at this point, the
+ * child prevents the parent from being evicted. However, once we
+ * update the parent's index, it will no longer refer to the child, and
+ * could conceivably be evicted. Get a hazard pointer on the parent
+ * now, so that we can safely access it after updating the index.
+ */
+ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
+ WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
+ hazard = 1;
+ }
+
+ pindex = WT_INTL_INDEX_COPY(parent);
+ parent_entries = pindex->entries;
+ result_entries = (parent_entries - 1) + split_entries;
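+ /*
+ * A worked example of the arithmetic above: if the parent's index is
+ * {A, B, C} (parent_entries == 3) and child B split into 4 blocks
+ * (split_entries == 4), the new index is {A, B1, B2, B3, B4, C}:
+ * result_entries == (3 - 1) + 4 == 6, the child's single slot
+ * replaced by the split pages' slots.
+ */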
+
+ /*
+ * Allocate and initialize a new page index array for the parent, then
+ * copy references from the original index array, plus references from
+ * the newly created split array, into place.
+ */
+ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
+ WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
+ WT_MEMSIZE_ADD(parent_incr, size);
+ alloc_index->index = (WT_REF **)(alloc_index + 1);
+ alloc_index->entries = result_entries;
+ for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i)
+ if (pindex->index[i] == ref)
+ for (j = 0; j < split_entries; ++j) {
+ ref_tmp[j]->home = parent;
+ *alloc_refp++ = ref_tmp[j];
+
+ /*
+ * Clear the split reference as it moves to the
+ * allocated page index, so it never appears on
+ * both after an error.
+ */
+ ref_tmp[j] = NULL;
+ }
+ else
+ *alloc_refp++ = pindex->index[i];
+ __wt_free(session, ref_tmp);
+
+ /*
+ * Update the parent page's index: this update makes the split visible
+ * to threads descending the tree.
+ */
+ WT_INTL_INDEX_SET(parent, alloc_index);
+ alloc_index = NULL;
+
+#ifdef HAVE_DIAGNOSTIC
+ __split_verify_intl_key_order(session, parent);
+#endif
+
+ /*
+ * Reset the page's original WT_REF field to split. Threads cursoring
+ * through the tree were blocked because that WT_REF state was set to
+ * locked. This update changes the locked state to split, unblocking
+ * those threads and causing them to re-calculate their position based
+ * on the updated parent page's index.
+ */
+ WT_PUBLISH(ref->state, WT_REF_SPLIT);
+
+ /*
+ * A note on error handling: failures before we swapped the new page
+ * index into the parent can be resolved by simply freeing allocated
+ * memory because the original page is unchanged, we can continue to
+ * use it and we have not yet modified the parent. (See below for an
+ * exception, we cannot discard pages referencing unresolved changes.)
+ * Failures after we swap the new page index into the parent are also
+ * relatively benign because the split is OK and complete and the page
+ * is reset so it will be discarded by eviction. For that reason, we
+ * mostly ignore further errors unless there's a panic.
+ */
+ complete = 1;
+
+ /*
+ * The previous parent page's key for this child page may have been an
+ * on-page overflow key. In that case, if the key hasn't been deleted,
+ * delete it now, including its backing blocks. We are exchanging the
+ * WT_REF that referenced it for the split page WT_REFs and their keys,
+ * and there's no longer any reference to it. Done after completing the
+ * split (if we failed, we'd leak the underlying blocks, but the parent
+ * page would be unaffected).
+ */
+ if (parent->type == WT_PAGE_ROW_INT)
+ WT_TRET(__split_ovfl_key_cleanup(session, parent, ref));
+
+ /*
+ * We can't free the previous page index, or the page's original WT_REF
+ * structure and instantiated key, there may be threads using them. Add
+ * them to the session discard list, to be freed once we know it's safe.
+ */
+ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
+ WT_TRET(__split_safe_free(session, exclusive, pindex, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ if (parent->type == WT_PAGE_ROW_INT &&
+ (ikey = __wt_ref_key_instantiated(ref)) != NULL) {
+ size = sizeof(WT_IKEY) + ikey->size;
+ WT_TRET(__split_safe_free(session, exclusive, ikey, size));
+ WT_MEMSIZE_ADD(parent_decr, size);
+ }
+ /*
+ * Take a copy of the ref in case we can free it immediately: we still
+ * need to discard the page.
+ */
+ ref_copy = *ref;
+ WT_TRET(__split_safe_free(session, exclusive, ref, sizeof(WT_REF)));
+ WT_MEMSIZE_ADD(parent_decr, sizeof(WT_REF));
+
+ /*
+ * Adjust the parent's memory footprint. This may look odd, but we
+ * have already taken the allocation overhead into account, and an
+ * increment followed by a decrement will cancel out the normal
+ * adjustment.
+ */
+ __wt_cache_page_inmem_incr(session, parent, parent_incr);
+ __wt_cache_page_inmem_decr(session, parent, parent_decr);
+
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %s split into parent %p %" PRIu32 " -> %" PRIu32
+ " (%" PRIu32 ")",
+ child, __wt_page_type_string(child->type), parent, parent_entries,
+ result_entries, result_entries - parent_entries));
+
+ /*
+ * Simple page splits trickle up the tree, that is, as leaf pages grow
+ * large enough and are evicted, they'll split into their parent. And,
+ * as that parent grows large enough and is evicted, it will split into
+ * its parent and so on. When the page split wave reaches the root,
+ * the tree will permanently deepen as multiple root pages are written.
+ * However, this only helps if, first, the pages are evicted (and
+ * we resist evicting internal pages for obvious reasons), and second,
+ * the tree is closed and re-opened from a disk image, which may be
+ * a rare event.
+ * To avoid the case of internal pages becoming too large when they
+ * aren't being evicted, check internal pages each time a leaf page is
+ * split into them. If it's big enough, deepen the tree at that point.
+ * Do the check here because we've just grown the parent page and
+ * are holding it locked.
+ */
+ if (ret == 0 && !exclusive && __split_should_deepen(session, parent))
+ ret = __split_deepen(session, parent);
+
+err: if (locked)
+ F_CLR_ATOMIC(parent, WT_PAGE_SPLITTING);
+
+ if (hazard)
+ WT_TRET(__wt_hazard_clear(session, parent));
+
+ /*
+ * Discard the child; test for split completion instead of errors, there
+ * might be a relatively innocuous error, and if we split the parent, we
+ * want to discard the child.
+ */
+ if (complete) {
+ /*
+ * Pages with unresolved changes are not marked clean during
+ * reconciliation, do it now.
+ */
+ if (__wt_page_is_modified(child)) {
+ mod->write_gen = 0;
+ __wt_cache_dirty_decr(session, child);
+ }
+ __wt_ref_out(session, &ref_copy);
+ }
+
+ /*
+ * A note on error handling: in the case of evicting a page that has
+ * unresolved changes, we just instantiated some in-memory pages that
+ * reflect those unresolved changes. The problem is those pages
+ * reference the same WT_UPDATE chains as the page we're splitting,
+ * that is, we simply copied references into the new pages. If the
+ * split fails, the original page is fine, but discarding the created
+ * page would free those update chains, and that's wrong. There isn't
+ * an easy solution, there's a lot of small memory allocations in some
+ * common code paths, and unwinding those changes will be difficult.
+ * For now, leak the memory by not discarding the instantiated pages.
+ */
+ __wt_free_ref_index(session, NULL, alloc_index, 0);
+ if (ref_tmp != NULL) {
+ for (i = 0; i < split_entries; ++i)
+ __wt_free_ref(session, child, ref_tmp[i], 0);
+ __wt_free(session, ref_tmp);
+ }
+
+ /*
+ * A note on error handling: if we completed the split, return success,
+ * nothing really bad can have happened.
+ */
+ return (ret == WT_PANIC || !complete ? ret : 0);
+}
+
+/*
+ * __split_evict_single --
+ * Resolve a single page split, replacing a page with a new version.
+ */
+static int
+__split_evict_single(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_REF new;
+
+ page = ref->page;
+ mod = page->modify;
+
+ /* Build the new page. */
+ memset(&new, 0, sizeof(new));
+ WT_RET(__split_inmem_build(session, page, &new, &mod->mod_multi[0]));
+
+ /*
+ * Discard the original page. Pages with unresolved changes are not
+ * marked clean during reconciliation, do it now.
+ */
+ mod->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ __wt_page_out(session, &page);
+
+ /* Swap the new page into place. */
+ ref->page = new.page;
+ WT_PUBLISH(ref->state, WT_REF_MEM);
+
+ return (0);
+}
+
+/*
+ * __wt_split_evict --
+ * Resolve a page split.
+ */
+int
+__wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
+{
+ uint32_t split_entries;
+
+ /*
+ * There are two cases entering this code. First, an in-memory page that
+ * got too large, we forcibly evicted it, and there wasn't anything to
+ * write. (Imagine two threads updating a small set of keys on a leaf page.
+ * The page is too large, so we try to evict it, but after reconciliation
+ * there's only a small amount of data (so it's a single page we can't
+ * split), and because there are two threads, there's some data we can't
+ * write (so we can't evict it). In that case, we take advantage of the
+ * fact we have exclusive access to the page and rewrite it in memory.)
+ *
+ * Second, a real split where we reconciled a page and it turned into a
+ * lot of pages.
+ */
+ split_entries = ref->page->modify->mod_multi_entries;
+ return (split_entries == 1 ?
+ __split_evict_single(session, ref) :
+ __split_evict_multi(session, ref, exclusive));
+}
diff --git a/src/third_party/wiredtiger/src/btree/rec_track.c b/src/third_party/wiredtiger/src/btree/rec_track.c
new file mode 100644
index 00000000000..92282393a23
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_track.c
@@ -0,0 +1,904 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Estimated memory cost for a structure on the overflow lists, the size of
+ * the structure plus two pointers (assume the average skip list depth is 2).
+ */
+#define WT_OVFL_SIZE(s) \
+ (sizeof(s) + 2 * sizeof(void *))
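+
+/*
+ * A worked example of the estimate (structure and pointer sizes are
+ * illustrative, they vary by platform): with 8B pointers, a 24B
+ * WT_OVFL_REUSE, a 20B address cookie and a 1KB value, the bytes charged
+ * against the page are WT_OVFL_SIZE(WT_OVFL_REUSE) + addr_size +
+ * value_size, that is, (24 + 2 * 8) + 20 + 1024 = 1084.
+ */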
+
+/*
+ * __ovfl_track_init --
+ * Initialize the overflow tracking structure.
+ */
+static int
+__ovfl_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ return (__wt_calloc_def(session, 1, &page->modify->ovfl_track));
+}
+
+/*
+ * __ovfl_discard_verbose --
+ * Dump information about a discard overflow record.
+ */
+static int
+__ovfl_discard_verbose(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag)
+{
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 512, &tmp));
+
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "discard: %s%s%p %s",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(session, unpack->data, unpack->size, tmp)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_discard_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_discard_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CELL **cellp;
+ WT_OVFL_TRACK *track;
+ size_t i;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+
+ track = page->modify->ovfl_track;
+ for (i = 0, cellp = track->discard;
+ i < track->discard_entries; ++i, ++cellp)
+ (void)__ovfl_discard_verbose(session, page, *cellp, "dump");
+}
+#endif
+
+/*
+ * __ovfl_discard_wrapup --
+ * Resolve the page's overflow discard list after a page is written.
+ */
+static int
+__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CELL **cellp;
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+ uint32_t i;
+
+ track = page->modify->ovfl_track;
+ for (i = 0, cellp = track->discard;
+ i < track->discard_entries; ++i, ++cellp) {
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_discard_verbose(
+ session, page, *cellp, "free"));
+
+ /* Discard each cell's overflow item. */
+ WT_RET(__wt_ovfl_discard(session, *cellp));
+ }
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+
+ return (ret);
+}
+
+/*
+ * __ovfl_discard_wrapup_err --
+ * Resolve the page's overflow discard list after an error occurs.
+ */
+static int
+__ovfl_discard_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TRACK *track;
+
+ track = page->modify->ovfl_track;
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_discard_add --
+ * Add a new entry to the page's list of overflow records that have been
+ * discarded.
+ */
+int
+__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
+{
+ WT_OVFL_TRACK *track;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ track = page->modify->ovfl_track;
+ WT_RET(__wt_realloc_def(session, &track->discard_allocated,
+ track->discard_entries + 1, &track->discard));
+ track->discard[track->discard_entries++] = cell;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_discard_verbose(session, page, cell, "add"));
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_discard_free --
+ * Free the page's list of discarded overflow record addresses.
+ */
+void
+__wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+
+ track = page->modify->ovfl_track;
+
+ __wt_free(session, track->discard);
+ track->discard_entries = track->discard_allocated = 0;
+}
+
+/*
+ * __ovfl_reuse_verbose --
+ * Dump information about a reuse overflow record.
+ */
+static int
+__ovfl_reuse_verbose(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "reuse: %s%s%p %s (%s%s%s) {%.*s}",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(
+ session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp),
+ F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "",
+ F_ISSET(reuse, WT_OVFL_REUSE_INUSE) &&
+ F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "",
+ F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "",
+ WT_MIN(reuse->value_size, 40), (char *)WT_OVFL_REUSE_VALUE(reuse)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_reuse_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_REUSE **head, *reuse;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ for (reuse = head[0]; reuse != NULL; reuse = reuse->next[0])
+ (void)__ovfl_reuse_verbose(session, page, reuse, "dump");
+}
+#endif
+
+/*
+ * __ovfl_reuse_skip_search --
+ * Return the first, not in-use, matching value in the overflow reuse list.
+ */
+static WT_OVFL_REUSE *
+__ovfl_reuse_skip_search(
+ WT_OVFL_REUSE **head, const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **e, *next;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Values are not unique, and it's possible to have long lists
+ * of identical overflow items. (We've seen it in benchmarks.)
+ * Move through a list of identical items at the current level
+ * as long as the next one is also identical and in-use;
+ * otherwise, drop down a level. When at the bottom level,
+ * return the item if it's reusable, else NULL.
+ */
+ len = WT_MIN((*e)->value_size, value_size);
+ cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
+ if (cmp == 0 && (*e)->value_size == value_size) {
+ if (i == 0)
+ return (F_ISSET(*e,
+ WT_OVFL_REUSE_INUSE) ? NULL : *e);
+ if ((next = (*e)->next[i]) == NULL ||
+ !F_ISSET(next, WT_OVFL_REUSE_INUSE) ||
+ next->value_size != len || memcmp(
+ WT_OVFL_REUSE_VALUE(next), value, len) != 0) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ continue;
+ }
+
+ /*
+ * If the skiplist value is larger than the search value, or
+ * they compare equally and the skiplist value is longer than
+ * the search value, drop down a level, otherwise continue on
+ * this level.
+ */
+ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __ovfl_reuse_skip_search_stack --
+ * Search an overflow reuse skiplist, returning an insert/remove stack.
+ */
+static void
+__ovfl_reuse_skip_search_stack(WT_OVFL_REUSE **head,
+ WT_OVFL_REUSE ***stack, const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ stack[i--] = e--;
+ continue;
+ }
+
+ /*
+ * If the skiplist value is larger than the search value, or
+ * they compare equally and the skiplist value is longer than
+ * the search value, drop down a level, otherwise continue on
+ * this level.
+ */
+ len = WT_MIN((*e)->value_size, value_size);
+ cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
+ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size))
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+ }
+}
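+
+/*
+ * A worked example of the insert stack: searching a three-level list for
+ * value "m" where level 0 is {d, k, r}, level 1 is {d, r} and level 2 is
+ * {r}, the returned stack addresses k's level-0 next pointer, d's level-1
+ * next pointer and the head's level-2 slot. Inserting a depth-2 node for
+ * "m" copies those slots into m's next array and then redirects them to
+ * "m", exactly as __wt_ovfl_reuse_add does below.
+ */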
+
+/*
+ * __ovfl_reuse_wrapup --
+ * Resolve the page's overflow reuse list after a page is written.
+ */
+static int
+__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_OVFL_REUSE **e, **head, *reuse;
+ size_t incr, decr;
+ int i;
+
+ bm = S2BT(session)->bm;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * Discard any overflow records that aren't in-use, freeing underlying
+ * blocks.
+ *
+ * First, walk the overflow reuse lists (except for the lowest one),
+ * fixing up skiplist links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (F_ISSET(*e, WT_OVFL_REUSE_INUSE)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /*
+ * Second, discard any overflow record without an in-use flag, clear
+ * the flags for the next run.
+ *
+ * As part of the pass through the lowest level, figure out how much
+ * space we added/subtracted from the page, and update its footprint.
+ * We don't get it exactly correct because we don't know the depth of
+ * the skiplist here, but it's close enough, and figuring out the
+ * memory footprint change in the reconciliation wrapup code means
+ * fewer atomic updates and less code overall.
+ */
+ incr = decr = 0;
+ for (e = &head[0]; (reuse = *e) != NULL;) {
+ if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
+ if (F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED))
+ incr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ F_CLR(reuse,
+ WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));
+ decr += WT_OVFL_SIZE(WT_OVFL_REUSE) +
+ reuse->addr_size + reuse->value_size;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_reuse_verbose(session, page, reuse, "free"));
+ WT_RET(bm->free(
+ bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+ __wt_free(session, reuse);
+ }
+
+ if (incr > decr)
+ __wt_cache_page_inmem_incr(session, page, incr - decr);
+ if (decr > incr)
+ __wt_cache_page_inmem_decr(session, page, decr - incr);
+ return (0);
+}
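+
+/*
+ * A worked example of the footprint netting above (sizes illustrative):
+ * if this run added two 100-byte entries the page keeps (incr == 200)
+ * and discarded one 300-byte entry from an earlier run (decr == 300),
+ * the page footprint shrinks by the 100-byte difference in a single
+ * adjustment, rather than by separate atomic increment and decrement
+ * operations.
+ */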
+
+/*
+ * __ovfl_reuse_wrapup_err --
+ * Resolve the page's overflow reuse list after an error occurs.
+ */
+static int
+__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_OVFL_REUSE **e, **head, *reuse;
+ int i;
+
+ bm = S2BT(session)->bm;
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * Discard any overflow records that were just added, freeing underlying
+ * blocks.
+ *
+ * First, walk the overflow reuse lists (except for the lowest one),
+ * fixing up skiplist links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (!F_ISSET(*e, WT_OVFL_REUSE_JUST_ADDED)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /*
+ * Second, discard any overflow record with a just-added flag, clear the
+ * flags for the next run.
+ */
+ for (e = &head[0]; (reuse = *e) != NULL;) {
+ if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
+ F_CLR(reuse, WT_OVFL_REUSE_INUSE);
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_reuse_verbose(session, page, reuse, "free"));
+ WT_TRET(bm->free(
+ bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
+ __wt_free(session, reuse);
+ }
+ return (0);
+}
+
+/*
+ * __wt_ovfl_reuse_search --
+ * Search the page's list of overflow records for a match.
+ */
+int
+__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
+ uint8_t **addrp, size_t *addr_sizep,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **head, *reuse;
+
+ *addrp = NULL;
+ *addr_sizep = 0;
+
+ if (page->modify->ovfl_track == NULL)
+ return (0);
+
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /*
+ * The search function returns the first matching record in the list
+ * which does not have the in-use flag set, or NULL.
+ */
+ if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL)
+ return (0);
+
+ *addrp = WT_OVFL_REUSE_ADDR(reuse);
+ *addr_sizep = reuse->addr_size;
+ F_SET(reuse, WT_OVFL_REUSE_INUSE);
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim"));
+ return (1);
+}
+
+/*
+ * __wt_ovfl_reuse_add --
+ * Add a new entry to the page's list of overflow records tracked for
+ * reuse.
+ */
+int
+__wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_REUSE **head, *reuse, **stack[WT_SKIP_MAXDEPTH];
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_reuse;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_REUSE structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_REUSE structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_REUSE) +
+ skipdepth * sizeof(WT_OVFL_REUSE *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &reuse));
+ p = (uint8_t *)reuse +
+ sizeof(WT_OVFL_REUSE) + skipdepth * sizeof(WT_OVFL_REUSE *);
+ reuse->addr_offset = (uint8_t)WT_PTRDIFF(p, reuse);
+ reuse->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ p += addr_size;
+ reuse->value_offset = WT_PTRDIFF32(p, reuse);
+ reuse->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ F_SET(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_reuse_skip_search_stack(head, stack, value, value_size);
+ for (i = 0; i < skipdepth; ++i) {
+ reuse->next[i] = *stack[i];
+ *stack[i] = reuse;
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "add"));
+
+ return (0);
+}
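+
+/*
+ * A sketch of the single allocation laid out above (skipdepth == 2 shown;
+ * field widths are illustrative):
+ *
+ * | WT_OVFL_REUSE | next[0] | next[1] | address cookie | value |
+ *                                     ^addr_offset     ^value_offset
+ *
+ * Both offsets are measured from the start of the structure, which is why
+ * a single byte is enough for addr_offset: the structure and skiplist
+ * pointers that precede the cookie total well under 255 bytes.
+ */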
+
+/*
+ * __wt_ovfl_reuse_free --
+ * Free the page's list of overflow records tracked for reuse.
+ */
+void
+__wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_REUSE *reuse;
+ WT_PAGE_MODIFY *mod;
+ void *next;
+
+ mod = page->modify;
+ if (mod == NULL || mod->ovfl_track == NULL)
+ return;
+
+ for (reuse = mod->ovfl_track->ovfl_reuse[0];
+ reuse != NULL; reuse = next) {
+ next = reuse->next[0];
+ __wt_free(session, reuse);
+ }
+}
+
+/*
+ * __ovfl_txnc_verbose --
+ * Dump information about a transaction-cached overflow record.
+ */
+static int
+__ovfl_txnc_verbose(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 64, &tmp));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
+ "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}",
+ tag == NULL ? "" : tag,
+ tag == NULL ? "" : ": ",
+ page,
+ __wt_addr_string(
+ session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp),
+ txnc->current,
+ WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc)));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+#if 0
+/*
+ * __ovfl_txnc_dump --
+ * Debugging information.
+ */
+static void
+__ovfl_txnc_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC **head, *txnc;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return;
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ for (txnc = head[0]; txnc != NULL; txnc = txnc->next[0])
+ (void)__ovfl_txnc_verbose(session, page, txnc, "dump");
+}
+#endif
+
+/*
+ * __ovfl_txnc_skip_search --
+ * Return the first matching addr in the overflow transaction-cache list.
+ */
+static WT_OVFL_TXNC *
+__ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size)
+{
+ WT_OVFL_TXNC **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Return any exact matches: we don't care in what search level
+ * we found a match.
+ */
+ len = WT_MIN((*e)->addr_size, addr_size);
+ cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
+ if (cmp == 0 && (*e)->addr_size == addr_size)
+ return (*e);
+
+ /*
+ * If the skiplist address is larger than the search address, or
+ * they compare equally and the skiplist address is longer than
+ * the search address, drop down a level, otherwise continue on
+ * this level.
+ */
+ if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
+
+/*
+ * __ovfl_txnc_skip_search_stack --
+ * Search an overflow transaction-cache skiplist, returning an
+ * insert/remove stack.
+ */
+static void
+__ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head,
+ WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size)
+{
+ WT_OVFL_TXNC **e;
+ size_t len;
+ int cmp, i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ stack[i--] = e--;
+ continue;
+ }
+
+ /*
+ * If the skiplist addr is larger than the search addr, or
+ * they compare equally and the skiplist addr is longer than
+ * the search addr, drop down a level, otherwise continue on
+ * this level.
+ */
+ len = WT_MIN((*e)->addr_size, addr_size);
+ cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
+ if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size))
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+ }
+}
+
+/*
+ * __ovfl_txnc_wrapup --
+ * Resolve the page's transaction-cache list.
+ */
+static int
+__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC **e, **head, *txnc;
+ size_t decr;
+ int i;
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /*
+ * Discard any transaction-cache records with transaction IDs earlier
+ * than any in the system.
+ *
+ * First, walk the overflow transaction-cache skip lists (except for
+ * the lowest level), fixing up links.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
+ for (e = &head[i]; *e != NULL;) {
+ if (!__wt_txn_visible_all(session, (*e)->current)) {
+ e = &(*e)->next[i];
+ continue;
+ }
+ *e = (*e)->next[i];
+ }
+
+ /* Second, discard any no longer needed transaction-cache records. */
+ decr = 0;
+ for (e = &head[0]; (txnc = *e) != NULL;) {
+ if (!__wt_txn_visible_all(session, txnc->current)) {
+ e = &(*e)->next[0];
+ continue;
+ }
+ *e = (*e)->next[0];
+
+ decr += WT_OVFL_SIZE(WT_OVFL_TXNC) +
+ txnc->addr_size + txnc->value_size;
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(
+ __ovfl_txnc_verbose(session, page, txnc, "free"));
+ __wt_free(session, txnc);
+ }
+
+ if (decr != 0)
+ __wt_cache_page_inmem_decr(session, page, decr);
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_search --
+ * Search the page's list of transaction-cache overflow records for a
+ * match.
+ */
+int
+__wt_ovfl_txnc_search(
+ WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store)
+{
+ WT_OVFL_TXNC **head, *txnc;
+
+ if (page->modify->ovfl_track == NULL)
+ return (WT_NOTFOUND);
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ if ((txnc = __ovfl_txnc_skip_search(head, addr, addr_size)) == NULL)
+ return (WT_NOTFOUND);
+
+ store->data = WT_OVFL_TXNC_VALUE(txnc);
+ store->size = txnc->value_size;
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_add --
+ * Add a new entry to the page's list of transaction-cached overflow
+ * records.
+ */
+int
+__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
+ const uint8_t *addr, size_t addr_size,
+ const void *value, size_t value_size)
+{
+ WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
+ size_t size;
+ u_int i, skipdepth;
+ uint8_t *p;
+
+ if (page->modify->ovfl_track == NULL)
+ WT_RET(__ovfl_track_init(session, page));
+
+ head = page->modify->ovfl_track->ovfl_txnc;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
+ * list, room for the address and value, then copy everything into
+ * place.
+ *
+ * To minimize the WT_OVFL_TXNC structure size, the address offset
+ * and size are single bytes: that's safe because the address follows
+ * the structure (which can't be more than about 100B), and address
+ * cookies are limited to 255B.
+ */
+ size = sizeof(WT_OVFL_TXNC) +
+ skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
+ WT_RET(__wt_calloc(session, 1, size, &txnc));
+ p = (uint8_t *)txnc +
+ sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
+ txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
+ txnc->addr_size = (uint8_t)addr_size;
+ memcpy(p, addr, addr_size);
+ p += addr_size;
+ txnc->value_offset = WT_PTRDIFF32(p, txnc);
+ txnc->value_size = WT_STORE_SIZE(value_size);
+ memcpy(p, value, value_size);
+ txnc->current = __wt_txn_new_id(session);
+
+ __wt_cache_page_inmem_incr(session, page,
+ WT_OVFL_SIZE(WT_OVFL_TXNC) + addr_size + value_size);
+
+ /* Insert the new entry into the skiplist. */
+ __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
+ for (i = 0; i < skipdepth; ++i) {
+ txnc->next[i] = *stack[i];
+ *stack[i] = txnc;
+ }
+
+ if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
+ WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));
+
+ return (0);
+}
+
+/*
+ * __wt_ovfl_txnc_free --
+ * Free the page's list of transaction-cached overflow records.
+ */
+void
+__wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_OVFL_TXNC *txnc;
+ WT_PAGE_MODIFY *mod;
+ void *next;
+
+ mod = page->modify;
+ if (mod == NULL || mod->ovfl_track == NULL)
+ return;
+
+ for (txnc = mod->ovfl_track->ovfl_txnc[0];
+ txnc != NULL; txnc = next) {
+ next = txnc->next[0];
+ __wt_free(session, txnc);
+ }
+}
+
+/*
+ * __wt_ovfl_track_wrapup --
+ * Resolve the page's overflow tracking on reconciliation success.
+ */
+int
+__wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return (0);
+
+ track = page->modify->ovfl_track;
+ if (track->discard != NULL)
+ WT_RET(__ovfl_discard_wrapup(session, page));
+
+ if (track->ovfl_reuse[0] != NULL)
+ WT_RET(__ovfl_reuse_wrapup(session, page));
+
+ if (track->ovfl_txnc[0] != NULL) {
+ WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+ ret = __ovfl_txnc_wrapup(session, page);
+ WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_ovfl_track_wrapup_err --
+ * Resolve the page's overflow tracking on reconciliation error.
+ */
+int
+__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_DECL_RET;
+ WT_OVFL_TRACK *track;
+
+ if (page->modify == NULL || page->modify->ovfl_track == NULL)
+ return (0);
+
+ track = page->modify->ovfl_track;
+ if (track->discard != NULL)
+ WT_RET(__ovfl_discard_wrapup_err(session, page));
+
+ if (track->ovfl_reuse[0] != NULL)
+ WT_RET(__ovfl_reuse_wrapup_err(session, page));
+
+ if (track->ovfl_txnc[0] != NULL) {
+ WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
+ ret = __ovfl_txnc_wrapup(session, page);
+ WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
+ }
+ return (ret);
+}
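+
+/*
+ * A minimal caller sketch (not built; __example_track_resolve is
+ * hypothetical): reconciliation resolves overflow tracking in its wrapup
+ * path, taking the success variant once the new blocks are durable and the
+ * error variant if the write fails, so just-added records are backed out.
+ */
+#if 0
+static int
+__example_track_resolve(WT_SESSION_IMPL *session, WT_PAGE *page, int ret)
+{
+ if (ret == 0)
+  WT_RET(__wt_ovfl_track_wrapup(session, page));
+ else
+  WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
+ return (ret);
+}
+#endif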
diff --git a/src/third_party/wiredtiger/src/btree/rec_write.c b/src/third_party/wiredtiger/src/btree/rec_write.c
new file mode 100644
index 00000000000..1b3a9a0898f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/rec_write.c
@@ -0,0 +1,5521 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY;
+struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY;
+struct __rec_kv; typedef struct __rec_kv WT_KV;
+
+/*
+ * Reconciliation is the process of taking an in-memory page, walking each entry
+ * in the page, building a backing disk image in a temporary buffer representing
+ * that information, and writing that buffer to disk. What could be simpler?
+ *
+ * WT_RECONCILE --
+ * Information tracking a single page reconciliation.
+ */
+typedef struct {
+ WT_REF *ref; /* Page being reconciled */
+ WT_PAGE *page;
+ uint32_t flags; /* Caller's configuration */
+
+ WT_ITEM dsk; /* Temporary disk-image buffer */
+
+ /* Track whether all changes to the page are written. */
+ uint64_t max_txn;
+ uint64_t skipped_txn;
+ uint32_t orig_write_gen;
+
+ /*
+ * If page updates are skipped because they are as yet unresolved, or
+ * the page has updates we cannot discard, the page is left "dirty":
+ * the page cannot be discarded and a subsequent reconciliation will
+ * be necessary to discard the page.
+ */
+ int leave_dirty;
+
+ /*
+ * Raw compression (don't get me started, as if normal reconciliation
+ * wasn't bad enough). If an application wants absolute control over
+ * what gets written to disk, we give it a list of byte strings and it
+ * gives us back an image that becomes a file block. Because we don't
+ * know the number of items we're storing in a block until we've done
+ * a lot of work, we turn off most compression: dictionary, copy-cell,
+ * prefix and row-store internal page suffix compression are all off.
+ */
+ int raw_compression;
+ uint32_t raw_max_slots; /* Raw compression array sizes */
+ uint32_t *raw_entries; /* Raw compression slot entries */
+ uint32_t *raw_offsets; /* Raw compression slot offsets */
+ uint64_t *raw_recnos; /* Raw compression recno count */
+ WT_ITEM raw_destination; /* Raw compression destination buffer */
+
+ /*
+ * Track if reconciliation has seen any overflow items. If a leaf page
+ * with no overflow items is written, the parent page's address cell is
+ * set to the leaf-no-overflow type. This means we can delete the leaf
+ * page without reading it because we don't have to discard any overflow
+ * items it might reference.
+ *
+ * The test is per-page reconciliation, that is, once we see an
+ * overflow item on the page, all subsequent leaf pages written for the
+ * page will not be leaf-no-overflow type, regardless of whether or not
+ * they contain overflow items. In other words, leaf-no-overflow is not
+ * guaranteed to be set on every page that doesn't contain an overflow
+ * item, only that if it is set, the page contains no overflow items.
+ *
+ * The reason is because of raw compression: there's no easy/fast way to
+ * figure out if the rows selected by raw compression included overflow
+ * items, and the optimization isn't worth another pass over the data.
+ */
+ int ovfl_items;
+
+ /*
+ * Track if reconciliation of a row-store leaf page has seen empty (zero
+ * length) values. We don't write out anything for empty values, so if
+ * there are empty values on a page, we have to make two passes over the
+ * page when it's read to figure out how many keys it has, expensive in
+ * the common case of no empty values and (entries / 2) keys. Likewise,
+ * a page with only empty values is another common data set, and keys on
+ * that page will be equal to the number of entries. In both cases, set
+ * a flag in the page's on-disk header.
+ *
+ * The test is per-page reconciliation as described above for the
+ * overflow-item test.
+ */
+ int all_empty_value, any_empty_value;
+
+ /*
+ * Reconciliation gets tricky if we have to split a page, which happens
+ * when the disk image we create exceeds the page type's maximum disk
+ * image size.
+ *
+ * First, the sizes of the page we're building. If WiredTiger is doing
+ * page layout, page_size is the same as page_size_max. We accumulate
+ * the maximum page size of raw data and when we reach that size, we
+ * split the page into multiple chunks, eventually compressing those
+ * chunks. When the application is doing page layout (raw compression
+ * is configured), page_size can continue to grow past page_size_max,
+ * and we keep accumulating raw data until the raw compression callback
+ * accepts it.
+ */
+ uint32_t page_size; /* Current page size */
+ uint32_t page_size_max; /* Maximum on-disk page size */
+
+ /*
+ * Second, the split size: if we're doing the page layout, split to a
+ * smaller-than-maximum page size when a split is required so we don't
+ * repeatedly split a packed page.
+ */
+ uint32_t split_size; /* Split page size */
+
+ /*
+ * The problem with splits is we've done a lot of work by the time we
+ * realize we're going to have to split, we don't want to start over.
+ *
+ * To keep from having to start over when we hit the maximum page size,
+ * we track the page information when we approach a split boundary.
+ * If we eventually have to split, we walk this structure and pretend
+ * we were splitting all along. After that, we continue to append to
+ * this structure, and eventually walk it to create a new internal page
+ * that references all of our split pages.
+ */
+ struct __rec_boundary {
+ /*
+ * The start field records location in the initial split buffer,
+ * that is, the first byte of the split chunk recorded before we
+ * decide to split a page; the offset between the first byte of
+ * chunk[0] and the first byte of chunk[1] is chunk[0]'s length.
+ *
+ * Once we split a page, we stop filling in the start field, as
+ * we're writing the split chunks as we find them.
+ */
+ uint8_t *start; /* Split's first byte */
+
+ /*
+ * The recno and entries fields are the starting record number
+ * of the split chunk (for column-store splits), and the number
+ * of entries in the split chunk. These fields are used both
+ * to write the split chunk, and to create a new internal page
+ * to reference the split pages.
+ */
+ uint64_t recno; /* Split's starting record */
+ uint32_t entries; /* Split's entries */
+
+ WT_ADDR addr; /* Split's written location */
+ uint32_t size; /* Split's size */
+ uint32_t cksum; /* Split's checksum */
+ void *dsk; /* Split's disk image */
+
+ /*
+ * When busy pages get large, we need to be able to evict them
+ * even when they contain unresolved updates, or updates which
+ * cannot be evicted because of running transactions. In such
+ * cases, break the page into multiple blocks, write the blocks
+ * that can be evicted, saving lists of updates for blocks that
+ * cannot be evicted, then re-instantiate the blocks that cannot
+ * be evicted as new, in-memory pages, restoring the updates on
+ * those pages.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * The key for a row-store page; no column-store key is needed
+ * because the page's recno, stored in the recno field, is the
+ * column-store key.
+ */
+ WT_ITEM key; /* Promoted row-store key */
+
+ /*
+ * During wrapup, after reconciling the root page, we write a
+ * final block as part of a checkpoint. If raw compression
+ * was configured, that block may have already been compressed.
+ */
+ int already_compressed;
+ } *bnd; /* Saved boundaries */
+ uint32_t bnd_next; /* Next boundary slot */
+ uint32_t bnd_next_max; /* Maximum boundary slots used */
+ size_t bnd_entries; /* Total boundary slots */
+ size_t bnd_allocated; /* Bytes allocated */
+
+ /*
+ * We track the total number of page entries copied into split chunks
+ * so we can easily figure out how many entries are in the current split
+ * chunk.
+ */
+ uint32_t total_entries; /* Total entries in splits */
+
+ /*
+ * And there's state information as to where in this process we are:
+ * (1) tracking split boundaries because we can still fit more split
+ * chunks into the maximum page size, (2) tracking the maximum page
+ * size boundary because we can't fit any more split chunks into the
+ * maximum page size, (3) not performing boundary checks because it's
+ * either not useful with the current page size configuration, or
+ * because we've already been forced to split.
+ */
+ enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */
+ SPLIT_MAX=1, /* Next: the maximum page boundary */
+ SPLIT_TRACKING_OFF=2, /* No boundary checks */
+ SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */
+ bnd_state;
+
+ /*
+ * We track current information about the current record number, the
+ * number of entries copied into the temporary buffer, where we are
+ * in the temporary buffer, and how much memory remains. Those items
+ * are packaged here rather than passing pointers to stack locations
+ * around the code.
+ */
+ uint64_t recno; /* Current record number */
+ uint32_t entries; /* Current number of entries */
+ uint8_t *first_free; /* Current first free byte */
+ size_t space_avail; /* Remaining space in this chunk */
+
+ /*
+ * While reviewing updates for each page, we store skipped updates here,
+ * and then move them to per-block areas as the blocks are defined.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
+
+ /*
+ * We don't need to keep the 0th key around on internal pages, the
+ * search code ignores them as nothing can sort less by definition.
+ * There's some trickiness here, see the code for comments on how
+ * these fields work.
+ */
+ int cell_zero; /* Row-store internal page 0th key */
+
+ /*
+ * WT_DICTIONARY --
+ * We optionally build a dictionary of row-store values for leaf
+ * pages. Where two value cells are identical, only write the value
+ * once, the second and subsequent copies point to the original cell.
+ * The dictionary is fixed size, but organized in a skip-list to make
+ * searches faster.
+ */
+ struct __rec_dictionary {
+ uint64_t hash; /* Hash value */
+ void *cell; /* Matching cell */
+
+ u_int depth; /* Skiplist */
+ WT_DICTIONARY *next[0];
+ } **dictionary; /* Dictionary */
+ u_int dictionary_next, dictionary_slots; /* Next, max entries */
+ /* Skiplist head. */
+ WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
+
+ /*
+ * WT_KV--
+ * An on-page key/value item we're building.
+ */
+ struct __rec_kv {
+ WT_ITEM buf; /* Data */
+ WT_CELL cell; /* Cell and cell's length */
+ size_t cell_len;
+ size_t len; /* Total length of cell + data */
+ } k, v; /* Key/Value being built */
+
+ WT_ITEM *cur, _cur; /* Key/Value being built */
+ WT_ITEM *last, _last; /* Last key/value built */
+
+ int key_pfx_compress; /* If can prefix-compress next key */
+ int key_pfx_compress_conf; /* If prefix compression configured */
+ int key_sfx_compress; /* If can suffix-compress next key */
+ int key_sfx_compress_conf; /* If suffix compression configured */
+
+ int is_bulk_load; /* If it's a bulk load */
+
+ WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
+
+ int tested_ref_state; /* Debugging information */
+} WT_RECONCILE;
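+
+/*
+ * An illustrative pass through the boundary states above (sizes are
+ * hypothetical): with a 32KB maximum page size and a smaller split size,
+ * reconciliation starts in SPLIT_BOUNDARY, saving a boundary each time the
+ * buffer crosses a split-size increment. If the data grows past the maximum
+ * page size, it switches to SPLIT_MAX, writes the chunks saved so far, and
+ * then runs in SPLIT_TRACKING_OFF, writing each subsequent chunk as it
+ * fills. With raw compression, SPLIT_TRACKING_RAW lets the compression
+ * callback choose the block boundaries instead.
+ */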
+
+static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int);
+static void __rec_cell_build_addr(
+ WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
+static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, int *);
+static int __rec_cell_build_ovfl(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_KV *, uint8_t, uint64_t);
+static int __rec_cell_build_val(WT_SESSION_IMPL *,
+ WT_RECONCILE *, const void *, size_t, uint64_t);
+static int __rec_child_deleted(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *, int *);
+static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_var(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
+ WT_SALVAGE_COOKIE *, WT_ITEM *, int, uint8_t, uint64_t);
+static int __rec_destroy_session(WT_SESSION_IMPL *);
+static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
+static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_row_leaf(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+static int __rec_row_leaf_insert(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *);
+static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
+static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_split_row_promote(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
+static int __rec_split_write(WT_SESSION_IMPL *,
+ WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
+static int __rec_write_init(WT_SESSION_IMPL *,
+ WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
+static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_write_wrapup_err(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+
+static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
+static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
+static int __rec_dictionary_lookup(
+ WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
+static void __rec_dictionary_reset(WT_RECONCILE *);
+
+/*
+ * __wt_rec_write --
+ * Reconcile an in-memory page into its on-disk format, and write it.
+ */
+int
+__wt_rec_write(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
+ WT_RECONCILE *r;
+ int locked;
+
+ conn = S2C(session);
+ page = ref->page;
+ mod = page->modify;
+
+ /* We shouldn't get called with a clean page, that's an error. */
+ if (!__wt_page_is_modified(page))
+ WT_RET_MSG(session, WT_ERROR,
+ "Attempt to reconcile a clean page.");
+
+ WT_RET(__wt_verbose(session,
+ WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
+ WT_STAT_FAST_CONN_INCR(session, rec_pages);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages);
+ if (LF_ISSET(WT_EVICTING)) {
+ WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
+ }
+
+ /* Record the most recent transaction ID we will *not* write. */
+ mod->disk_snap_min = session->txn.snap_min;
+
+ /* Initialize the reconciliation structure for each new run. */
+ WT_RET(__rec_write_init(
+ session, ref, flags, salvage, &session->reconcile));
+ r = session->reconcile;
+
+ /*
+ * The compaction process looks at the page's modification information;
+ * if compaction is running, lock the page down.
+ *
+ * Otherwise, flip on the scanning flag: obsolete updates cannot be
+ * freed while reconciliation is in progress.
+ */
+ locked = 0;
+ if (conn->compact_in_memory_pass) {
+ locked = 1;
+ WT_PAGE_LOCK(session, page);
+ } else
+ for (;;) {
+ F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ if (ret == 0)
+ break;
+ __wt_yield();
+ }
+
+ /* Reconcile the page. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ if (salvage != NULL)
+ ret = __rec_col_fix_slvg(session, r, page, salvage);
+ else
+ ret = __rec_col_fix(session, r, page);
+ break;
+ case WT_PAGE_COL_INT:
+ ret = __rec_col_int(session, r, page);
+ break;
+ case WT_PAGE_COL_VAR:
+ ret = __rec_col_var(session, r, page, salvage);
+ break;
+ case WT_PAGE_ROW_INT:
+ ret = __rec_row_int(session, r, page);
+ break;
+ case WT_PAGE_ROW_LEAF:
+ ret = __rec_row_leaf(session, r, page, salvage);
+ break;
+ WT_ILLEGAL_VALUE_SET(session);
+ }
+
+ /* Wrap up the page reconciliation. */
+ if (ret == 0)
+ ret = __rec_write_wrapup(session, r, page);
+ else
+ WT_TRET(__rec_write_wrapup_err(session, r, page));
+
+ /* Release the page lock if we're holding one. */
+ if (locked)
+ WT_PAGE_UNLOCK(session, page);
+ else
+ F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+
+ /*
+ * Clean up the boundary structures: some workloads result in millions
+ * of these structures, and if associated with some random session that
+ * got roped into doing forced eviction, they won't be discarded for the
+ * life of the session.
+ */
+ __rec_bnd_cleanup(session, r, 0);
+
+ WT_RET(ret);
+
+ /*
+ * Root pages are special, splits have to be done, we can't put it off
+ * as the parent's problem any more.
+ */
+ if (__wt_ref_is_root(ref))
+ return (__rec_root_write(session, page, flags));
+
+ /*
+ * Otherwise, mark the page's parent dirty.
+ * Don't mark the tree dirty: if this reconciliation is in service of a
+ * checkpoint, it's cleared the tree's dirty flag, and we don't want to
+ * set it again as part of that walk.
+ */
+ return (__wt_page_parent_modify_set(session, ref, 1));
+}
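+
+/*
+ * A minimal caller sketch (not built; __example_evict_write is
+ * hypothetical, the flag shown is the one tested above): eviction
+ * reconciles a page to decide whether its blocks can be written and the
+ * page discarded.
+ */
+#if 0
+static int
+__example_evict_write(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ /* The salvage cookie is NULL outside of salvage operations. */
+ return (__wt_rec_write(session, ref, NULL, WT_EVICTING));
+}
+#endif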
+
+/*
+ * __rec_root_write --
+ * Handle the write of a root page.
+ */
+static int
+__rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_PAGE *next;
+ WT_PAGE_INDEX *pindex;
+ WT_PAGE_MODIFY *mod;
+ WT_REF fake_ref;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /*
+ * If a single root page was written (either an empty page or there was
+ * a 1-for-1 page swap), we've written root and checkpoint, we're done.
+ * If the root page split, write the resulting WT_REF array. We already
+ * have an infrastructure for writing pages, create a fake root page and
+ * write it instead of adding code to write blocks based on the list of
+ * blocks resulting from a multiblock reconciliation.
+ */
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY: /* Page is empty */
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ return (0);
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_SPLIT,
+ "root page split -> %" PRIu32 " pages", mod->mod_multi_entries));
+
+ /*
+ * Create a new root page, initialize the array of child references,
+ * mark it dirty, then write it.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_COL_INT, 1, mod->mod_multi_entries, 1, &next));
+ break;
+ case WT_PAGE_ROW_INT:
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, 1, &next));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ pindex = WT_INTL_INDEX_COPY(next);
+ for (i = 0; i < mod->mod_multi_entries; ++i) {
+ WT_ERR(__wt_multi_to_ref(session,
+ next, &mod->mod_multi[i], &pindex->index[i], NULL));
+ pindex->index[i]->home = next;
+ }
+
+ /*
+ * We maintain a list of pages written for the root in order to free the
+ * backing blocks the next time the root is written.
+ */
+ mod->mod_root_split = next;
+
+ WT_ERR(__wt_page_modify_init(session, next));
+ __wt_page_only_modify_set(session, next);
+
+ /*
+ * Fake up a reference structure, and write the next root page.
+ */
+ __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT);
+ return (__wt_rec_write(session, &fake_ref, NULL, flags));
+
+err: __wt_page_out(session, &next);
+ return (ret);
+}
+
+/*
+ * __rec_raw_compression_config --
+ * Configure raw compression.
+ */
+static inline int
+__rec_raw_compression_config(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Check if raw compression configured. */
+ if (btree->compressor == NULL ||
+ btree->compressor->compress_raw == NULL)
+ return (0);
+
+ /* Only for row-store and variable-length column-store objects. */
+ if (page->type == WT_PAGE_COL_FIX)
+ return (0);
+
+ /*
+ * Raw compression cannot support dictionary compression. (Technically,
+ * we could still use the raw callback on column-store variable length
+ * internal pages with dictionary compression configured, because
+ * dictionary compression only applies to column-store leaf pages, but
+ * that seems an unlikely use case.)
+ */
+ if (btree->dictionary != 0)
+ return (0);
+
+ /* Raw compression cannot support prefix compression. */
+ if (btree->prefix_compression != 0)
+ return (0);
+
+ /*
+ * Raw compression is also turned off during salvage: we can't allow
+ * pages to split during salvage, raw compression has no point if it
+ * can't manipulate the page size.
+ */
+ if (salvage != NULL)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * __rec_write_init --
+ * Initialize the reconciliation structure.
+ */
+static int
+__rec_write_init(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ WT_RECONCILE *r;
+
+ btree = S2BT(session);
+ page = ref->page;
+
+ if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
+ WT_RET(__wt_calloc_def(session, 1, &r));
+
+ *(WT_RECONCILE **)reconcilep = r;
+ session->reconcile_cleanup = __rec_destroy_session;
+
+ /* Connect pointers/buffers. */
+ r->cur = &r->_cur;
+ r->last = &r->_last;
+
+ /* Disk buffers need to be aligned for writing. */
+ F_SET(&r->dsk, WT_ITEM_ALIGNED);
+ }
+
+ /* Remember the configuration. */
+ r->ref = ref;
+ r->page = page;
+ r->flags = flags;
+
+ /* Track if the page can be marked clean. */
+ r->leave_dirty = 0;
+
+ /* Raw compression. */
+ r->raw_compression =
+ __rec_raw_compression_config(session, page, salvage);
+ r->raw_destination.flags = WT_ITEM_ALIGNED;
+
+ /* Track overflow items. */
+ r->ovfl_items = 0;
+
+ /* Track empty values. */
+ r->all_empty_value = 1;
+ r->any_empty_value = 0;
+
+ /* The list of cached, skipped updates. */
+ r->skip_next = 0;
+
+ /*
+ * Dictionary compression only writes repeated values once. We grow
+ * the dictionary as necessary, always using the largest size we've
+ * seen.
+ *
+ * Reset the dictionary.
+ *
+ * Sanity check the size: 100 slots is the smallest dictionary we use.
+ */
+ if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots)
+ WT_RET(__rec_dictionary_init(session,
+ r, btree->dictionary < 100 ? 100 : btree->dictionary));
+ __rec_dictionary_reset(r);
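+ /*
+ * For example (hypothetical numbers): a tree configured with
+ * dictionary=500 allocates 500 slots on first use; if a later
+ * reconciliation in this session needs only 200 slots, the existing
+ * 500-slot dictionary is reused and simply reset.
+ */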
+
+ /*
+ * Suffix compression shortens internal page keys by discarding trailing
+ * bytes that aren't necessary for tree navigation. We don't do suffix
+ * compression if there is a custom collator because we don't know what
+ * bytes a custom collator might use. Some custom collators (for
+ * example, a collator implementing reverse ordering of strings) won't
+ * have any problem with suffix compression: if there's ever a reason to
+ * implement suffix compression for custom collators, we can add a
+ * setting to the collator, configured when the collator is added, that
+ * turns on suffix compression.
+ *
+ * The raw compression routines don't even consider suffix compression,
+ * but it doesn't hurt to confirm that.
+ */
+ r->key_sfx_compress_conf = 0;
+ if (btree->collator == NULL &&
+ btree->internal_key_truncate && !r->raw_compression)
+ r->key_sfx_compress_conf = 1;
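+ /*
+ * For example (hypothetical keys): if the last key on one split chunk
+ * is "and" and the first key on the next is "applesauce", suffix
+ * compression promotes just "ap" to the internal page, which is
+ * enough to route searches correctly.
+ */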
+
+ /*
+ * Prefix compression discards repeated prefix bytes from row-store leaf
+ * page keys.
+ */
+ r->key_pfx_compress_conf = 0;
+ if (btree->prefix_compression && page->type == WT_PAGE_ROW_LEAF)
+ r->key_pfx_compress_conf = 1;
+
+ r->salvage = salvage;
+
+ /* Save the page's write generation before reading the page. */
+ WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Running transactions may update the page after we write it, so
+ * this is the highest ID we can be confident we will see.
+ */
+ r->skipped_txn = S2C(session)->txn_global.last_running;
+
+ return (0);
+}
+
+/*
+ * __rec_destroy --
+ * Clean up the reconciliation structure.
+ */
+static void
+__rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
+{
+ WT_RECONCILE *r;
+
+ if ((r = *(WT_RECONCILE **)reconcilep) == NULL)
+ return;
+ *(WT_RECONCILE **)reconcilep = NULL;
+
+ __wt_buf_free(session, &r->dsk);
+
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ __wt_buf_free(session, &r->raw_destination);
+
+ __rec_bnd_cleanup(session, r, 1);
+
+ __wt_free(session, r->skip);
+
+ __wt_buf_free(session, &r->k.buf);
+ __wt_buf_free(session, &r->v.buf);
+ __wt_buf_free(session, &r->_cur);
+ __wt_buf_free(session, &r->_last);
+
+ __rec_dictionary_free(session, r);
+
+ __wt_free(session, r);
+}
+
+/*
+ * __rec_destroy_session --
+ * Clean up the reconciliation structure, session version.
+ */
+static int
+__rec_destroy_session(WT_SESSION_IMPL *session)
+{
+ __rec_destroy(session, &session->reconcile);
+ return (0);
+}
+
+/*
+ * __rec_bnd_cleanup --
+ * Cleanup the boundary structure information.
+ */
+static void
+__rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
+{
+ WT_BOUNDARY *bnd;
+ uint32_t i, last_used;
+
+ if (r->bnd == NULL)
+ return;
+
+ /*
+ * Free the boundary structures' memory. In the case of normal cleanup,
+ * discard any memory we won't reuse in the next reconciliation; in the
+ * case of destruction, discard everything.
+ *
+ * During some big-page evictions we have seen boundary arrays that have
+ * millions of elements. That should not be a normal event, but if the
+ * memory is associated with a random session, it won't be discarded
+ * until the session is closed. If there are more than 10,000 boundary
+ * structure elements, destroy the boundary array and we'll start over.
+ */
+ if (destroy || r->bnd_entries > 10 * 1000) {
+ for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
+ __wt_free(session, bnd->addr.addr);
+ __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->skip);
+ __wt_buf_free(session, &bnd->key);
+ }
+ __wt_free(session, r->bnd);
+ r->bnd_next = 0;
+ r->bnd_entries = r->bnd_allocated = 0;
+ } else {
+ /*
+ * The boundary-next field points to the next boundary structure
+ * we were going to use, but there's no requirement that the value
+ * be incremented before reconciliation updates the structure it
+ * points to, that is, there's no guarantee elements of the next
+ * boundary structure are still unchanged. Be defensive, clean
+ * up the "next" structure as well as the ones we know we used.
+ */
+ last_used = r->bnd_next;
+ if (last_used < r->bnd_entries)
+ ++last_used;
+ for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
+ __wt_free(session, bnd->addr.addr);
+ __wt_free(session, bnd->dsk);
+ __wt_free(session, bnd->skip);
+ }
+ }
+}
+
+/*
+ * __rec_skip_update_save --
+ * Save a skipped WT_UPDATE list for later restoration.
+ */
+static int
+__rec_skip_update_save(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
+{
+ WT_RET(__wt_realloc_def(
+ session, &r->skip_allocated, r->skip_next + 1, &r->skip));
+ r->skip[r->skip_next].ins = ins;
+ r->skip[r->skip_next].rip = rip;
+ ++r->skip_next;
+ return (0);
+}
+
+/*
+ * __rec_skip_update_move --
+ * Move a skipped WT_UPDATE list from the per-page cache to a specific
+ * block's list.
+ */
+static int
+__rec_skip_update_move(
+ WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
+{
+ WT_RET(__wt_realloc_def(
+ session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
+ bnd->skip[bnd->skip_next] = *skip;
+ ++bnd->skip_next;
+
+ skip->ins = NULL;
+ skip->rip = NULL;
+ return (0);
+}
+
+/*
+ * __rec_txn_read --
+ * Return the first visible update in a list (or NULL if none are visible),
+ * set a flag if any updates were skipped, track the maximum transaction ID on
+ * the page.
+ */
+static inline int
+__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
+{
+ WT_ITEM ovfl;
+ WT_PAGE *page;
+ WT_UPDATE *upd, *upd_list, *upd_ovfl;
+ size_t notused;
+ uint64_t max_txn, min_txn, txnid;
+ int skipped;
+
+ *updp = NULL;
+
+ page = r->page;
+
+ /*
+ * If we're called with a WT_INSERT reference, use its WT_UPDATE
+ * list; otherwise, use the on-page row-store WT_UPDATE list.
+ */
+ upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd;
+ skipped = 0;
+
+ for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
+ upd != NULL; upd = upd->next) {
+ if ((txnid = upd->txnid) == WT_TXN_ABORTED)
+ continue;
+
+ /* Track the largest/smallest transaction IDs on the list. */
+ if (TXNID_LT(max_txn, txnid))
+ max_txn = txnid;
+ if (TXNID_LT(txnid, min_txn))
+ min_txn = txnid;
+ if (TXNID_LT(txnid, r->skipped_txn) &&
+ !__wt_txn_visible_all(session, txnid))
+ r->skipped_txn = txnid;
+
+ /*
+ * Record whether any updates were skipped on the way to finding
+ * the first visible update.
+ *
+ * If updates were skipped before the one being written, future
+ * reads without intervening modifications to the page could
+ * see a different value; if no updates were skipped, the page
+ * can safely be marked clean and does not need to be
+ * reconciled until modified again.
+ */
+ if (*updp == NULL) {
+ if (__wt_txn_visible(session, txnid))
+ *updp = upd;
+ else
+ skipped = 1;
+ }
+ }
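+ /*
+ * For example (hypothetical IDs): given an update list with entries
+ * from transactions 20 and 10 (newest first), a reader that can only
+ * see transaction 10 returns the older update and sets skipped,
+ * because a future read could legitimately see the newer one.
+ */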
+
+ /*
+ * Track the maximum transaction ID in the page. We store this in the
+ * page at the end of reconciliation if no updates are skipped; it's
+ * used to avoid evicting clean pages from memory with changes required
+ * to satisfy a snapshot read.
+ */
+ if (TXNID_LT(r->max_txn, max_txn))
+ r->max_txn = max_txn;
+
+ /*
+ * If all updates are globally visible and no updates were skipped, the
+ * page can be marked clean and we're done, regardless of whether we're
+ * evicting or checkpointing.
+ *
+ * The oldest transaction ID may have moved while we were scanning the
+ * page, so it is possible to skip an update but then find that by the
+ * end of the scan, all updates are stable.
+ */
+ if (__wt_txn_visible_all(session, max_txn) && !skipped)
+ return (0);
+
+ /*
+ * If some updates are not globally visible, or were skipped, the page
+ * cannot be marked clean.
+ */
+ r->leave_dirty = 1;
+
+ /* If we're not evicting, we're done, we know what we'll write. */
+ if (!F_ISSET(r, WT_EVICTING))
+ return (0);
+
+ /* In some cases, there had better not be any updates we can't write. */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /*
+ * If evicting and we aren't able to save/restore the not-yet-visible
+ * updates, the page can't be evicted.
+ */
+ if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
+ return (EBUSY);
+
+ /*
+ * Evicting a page with not-yet-visible updates: save and restore the
+ * list of updates on a newly instantiated page.
+ *
+ * The order of the updates on the list matters, so we can't move only
+ * the unresolved updates; we have to move the entire update list.
+ *
+ * Clear the returned update so our caller ignores the key/value pair
+ * in the case of an insert/append entry (everything we need is in the
+ * update list), and otherwise writes the original on-page key/value
+ * pair to which the update list applies.
+ */
+ *updp = NULL;
+
+ /*
+ * Handle the case where we don't want to write an original on-page value
+ * item to disk because it's been updated or removed.
+ *
+ * Here's the deal: an overflow value was updated or removed and its
+ * backing blocks freed. If any transaction in the system might still
+ * read the value, a copy was cached in page reconciliation tracking
+ * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
+ * then chose the page and we're splitting it up in order to push parts
+ * of it out of memory.
+ *
+ * We could write the original on-page value item to disk... if we had
+ * a copy. The cache may not have a copy (a globally visible update
+ * would have kept a value from ever being cached), or an update that
+ * subsequently became globally visible could cause a cached value to be
+ * discarded. Either way, once there's a globally visible update, we
+ * may not have the value.
+ *
+ * Fortunately, if there's a globally visible update we don't care about
+ * the original version, so we simply ignore it; no transaction can ever
+ * try to read it. If there isn't a globally visible update, there had
+ * better be a cached value.
+ *
+ * In the latter case, we could write the value out to disk, but (1) we
+ * are planning on re-instantiating this page in memory, it isn't going
+ * to disk, and (2) the value item is eventually going to be discarded,
+ * that seems like a waste of a write. Instead, find the cached value
+ * and append it to the update list we're saving for later restoration.
+ */
+ if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
+ !__wt_txn_visible_all(session, min_txn)) {
+ WT_RET(__wt_ovfl_txnc_search(
+ page, vpack->data, vpack->size, &ovfl));
+ /*
+ * Create an update structure with an impossibly low transaction
+ * ID and append it to the update list we're about to save.
+ * Restoring that update list when this page is re-instantiated
+ * creates an update for the key/value pair visible to every
+ * running transaction in the system, ensuring the on-page value
+ * will be ignored.
+ */
+ WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused));
+ upd_ovfl->txnid = WT_TXN_NONE;
+ for (upd = upd_list; upd->next != NULL; upd = upd->next)
+ ;
+ upd->next = upd_ovfl;
+ }
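+ /*
+ * Appending the cached value with an impossibly low transaction ID
+ * (WT_TXN_NONE) means every running transaction sees it once the list
+ * is restored, so the removed on-page overflow value can never be
+ * read again.
+ */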
+
+ return (__rec_skip_update_save(session, r, ins, rip));
+}
+
+/*
+ * CHILD_RELEASE --
+ * Macros to clean up during internal-page reconciliation, releasing the
+ * hazard pointer we're holding on child pages.
+ */
+#undef CHILD_RELEASE
+#define CHILD_RELEASE(session, hazard, ref) do { \
+ if (hazard) { \
+ hazard = 0; \
+ WT_TRET( \
+ __wt_page_release(session, ref, WT_READ_NO_EVICT)); \
+ } \
+} while (0)
+#undef CHILD_RELEASE_ERR
+#define CHILD_RELEASE_ERR(session, hazard, ref) do { \
+ CHILD_RELEASE(session, hazard, ref); \
+ WT_ERR(ret); \
+} while (0)
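+ /*
+ * A minimal usage sketch (hypothetical caller): acquire the hazard
+ * pointer via __rec_child_modify, process the child, then release it
+ * on both the normal and error paths:
+ *
+ *	WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ *	... process the child page ...
+ *	CHILD_RELEASE_ERR(session, hazard, ref);
+ */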
+
+/*
+ * __rec_child_modify --
+ * Return whether the internal page's child references any modifications.
+ */
+static int
+__rec_child_modify(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_REF *ref, int *hazardp, int *statep)
+{
+ WT_DECL_RET;
+ WT_PAGE_MODIFY *mod;
+
+ /* We may acquire a hazard pointer our caller must release. */
+ *hazardp = 0;
+
+#define WT_CHILD_IGNORE 1 /* Deleted child: ignore */
+#define WT_CHILD_MODIFIED 2 /* Modified child */
+#define WT_CHILD_PROXY 3 /* Deleted child: proxy */
+ *statep = 0;
+
+ /*
+ * This function is called when walking an internal page to decide how
+ * to handle child pages referenced by the internal page, specifically
+ * if the child page is to be merged into its parent.
+ *
+ * Internal pages are reconciled for two reasons: first, when evicting
+ * an internal page; second, by the checkpoint code when writing
+ * internal pages. During eviction, the subtree is locked down so all
+ * pages should be in the WT_REF_DISK or WT_REF_LOCKED state. During
+ * checkpoint, any eviction that might affect our review of an internal
+ * page is prohibited; however, as the subtree is not reserved for our
+ * exclusive use, there are other page states that must be considered.
+ */
+ for (;; __wt_yield())
+ switch (r->tested_ref_state = ref->state) {
+ case WT_REF_DISK:
+ /* On disk, not modified by definition. */
+ goto done;
+
+ case WT_REF_DELETED:
+ /*
+ * The child is in a deleted state.
+ *
+ * It's possible the state could change underneath us as
+ * the page is read in, and we can race between checking
+ * for a deleted state and looking at the transaction ID
+ * to see if the delete is visible to us. Lock down the
+ * structure.
+ */
+ if (!WT_ATOMIC_CAS4(
+ ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ break;
+ ret = __rec_child_deleted(session, r, ref, statep);
+ WT_PUBLISH(ref->state, WT_REF_DELETED);
+ goto done;
+
+ case WT_REF_LOCKED:
+ /*
+ * Locked.
+ *
+ * If evicting, the evicted page's subtree, including
+ * this child, was selected for eviction by us and the
+ * state is stable until we reset it, it's an in-memory
+ * state. This is the expected state for a child being
+ * merged into a page (where the page was selected by
+ * the eviction server for eviction).
+ */
+ if (F_ISSET(r, WT_EVICTING))
+ goto in_memory;
+
+ /*
+ * If called during checkpoint, the child is being
+ * considered by the eviction server or the child is a
+ * fast-delete page being read. The eviction may have
+ * started before the checkpoint and so we must wait
+ * for the eviction to be resolved. I suspect we could
+ * handle fast-delete reads, but we can't distinguish
+ * between the two and fast-delete reads aren't expected
+ * to be common.
+ */
+ break;
+
+ case WT_REF_MEM:
+ /*
+ * In memory.
+ *
+ * If evicting, the evicted page's subtree, including
+ * this child, was selected for eviction by us and the
+ * state is stable until we reset it, it's an in-memory
+ * state. This is the expected state for a child being
+ * merged into a page (where the page belongs to a file
+ * being discarded from the cache during close).
+ */
+ if (F_ISSET(r, WT_EVICTING))
+ goto in_memory;
+
+ /*
+ * If called during checkpoint, acquire a hazard pointer
+ * so the child isn't evicted, it's an in-memory case.
+ *
+ * This call cannot return split/restart; dirty page
+ * eviction is shut out during checkpoint, and all
+ * splits in process will have completed before we
+ * walk any pages for checkpoint.
+ */
+ if ((ret = __wt_page_in(session, ref,
+ WT_READ_CACHE | WT_READ_NO_EVICT |
+ WT_READ_NO_GEN | WT_READ_NO_WAIT)) == WT_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ *hazardp = 1;
+ goto in_memory;
+
+ case WT_REF_READING:
+ /*
+ * Being read, not modified by definition.
+ *
+ * We should never be here during eviction, a child page
+ * in this state within an evicted page's subtree would
+ * normally have caused eviction to fail, and exclusive
+ * eviction shouldn't ever see pages being read.
+ */
+ WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ goto done;
+
+ case WT_REF_SPLIT:
+ /*
+ * The page was split out from under us.
+ *
+ * We should never be here during eviction, a child page
+ * in this state within an evicted page's subtree would
+ * have caused eviction to fail.
+ *
+ * We should never be here during checkpoint: dirty page
+ * eviction is shut out during checkpoint, and all splits
+ * in process will have completed before we walk any
+ * pages for checkpoint.
+ */
+ WT_ASSERT(session, ref->state != WT_REF_SPLIT);
+ /* FALLTHROUGH */
+
+ WT_ILLEGAL_VALUE(session);
+ }
+
+in_memory:
+ /*
+ * In-memory states: the child is potentially modified if the page's
+ * modify structure has been instantiated. If the modify structure
+ * exists and the page has actually been modified, set that state.
+ * If that's not the case, we would normally use the original cell's
+ * disk address as our reference, but, if we're forced to instantiate
+ * a deleted child page and it's never modified, we end up here with
+ * a page that has a modify structure, no modifications, and no disk
+ * address. Ignore those pages: they're not modified and there is no
+ * reason to write the cell.
+ */
+ mod = ref->page->modify;
+ if (mod != NULL && mod->flags != 0)
+ *statep = WT_CHILD_MODIFIED;
+ else if (ref->addr == NULL) {
+ *statep = WT_CHILD_IGNORE;
+ CHILD_RELEASE(session, *hazardp, ref);
+ }
+
+done: WT_HAVE_DIAGNOSTIC_YIELD;
+ return (ret);
+}
+
+/*
+ * __rec_child_deleted --
+ * Handle pages with leaf pages in the WT_REF_DELETED state.
+ */
+static int
+__rec_child_deleted(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep)
+{
+ WT_BM *bm;
+ WT_PAGE_DELETED *page_del;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ bm = S2BT(session)->bm;
+ page_del = ref->page_del;
+
+ /*
+ * Internal pages with child leaf pages in the WT_REF_DELETED state are
+ * a special case during reconciliation. First, if the deletion was a
+ * result of a session truncate call, the deletion may not be visible to
+ * us. In that case, we proceed as with any change that's not visible
+ * during reconciliation by setting the skipped flag and ignoring the
+ * change for the purposes of writing the internal page.
+ *
+ * In this case, there must be an associated page-deleted structure, and
+ * it holds the transaction ID we care about.
+ */
+ if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) {
+ /*
+ * In some cases, there had better not be any updates we can't
+ * write.
+ */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
+
+ /* If this page cannot be evicted, quit now. */
+ if (F_ISSET(r, WT_EVICTING))
+ return (EBUSY);
+ }
+
+ /*
+ * The deletion is visible to us, deal with any underlying disk blocks.
+ *
+ * First, check to see if there is an address associated with this leaf:
+ * if there isn't, we're done, the underlying page is already gone. If
+ * the page still exists, check for any transactions in the system that
+ * might want to see the page's state before it's deleted.
+ *
+ * If any such transactions exist, we cannot discard the underlying leaf
+ * page to the block manager because the transaction may eventually read
+ * it. However, this write might be part of a checkpoint, and should we
+ * recover to that checkpoint, we'll need to delete the leaf page, else
+ * we'd leak it. The solution is to write a proxy cell on the internal
+ * page ensuring the leaf page is eventually discarded.
+ *
+ * If no such transactions exist, we can discard the leaf page to the
+ * block manager and no cell needs to be written at all. We do this
+ * outside of the underlying tracking routines because this action is
+ * permanent and irrevocable. (Clearing the address means we've lost
+ * track of the disk address in a permanent way. This is safe because
+ * there's no path to reading the leaf page again: if there's ever a
+ * read into this part of the name space again, the cache read function
+ * instantiates an entirely new page.)
+ */
+ if (ref->addr != NULL &&
+ (page_del == NULL ||
+ __wt_txn_visible_all(session, page_del->txnid))) {
+ WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
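+ /*
+ * For example: a leaf deleted by a globally visible truncate has its
+ * backing blocks freed here and needs no cell at all; a leaf whose
+ * delete isn't yet visible to all running transactions keeps its
+ * address and is written as a proxy cell instead.
+ */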
+
+ /*
+ * Minor memory cleanup: if a truncate call deleted this page and we
+ * were ever forced to instantiate the page in memory, we would have
+ * built a list of updates in the page reference in order to be able
+ * to abort the truncate. It's a cheap test to make that memory go
+ * away, and we do it here because there's really nowhere else we do
+ * the checks. In short, if we have such a list, and the backing address
+ * blocks are gone, there can't be any transaction that can abort.
+ */
+ if (ref->addr == NULL && page_del != NULL) {
+ __wt_free(session, ref->page_del->update_list);
+ __wt_free(session, ref->page_del);
+ }
+
+ /*
+ * If there's still a disk address, we have to write a proxy record;
+ * otherwise, we can safely ignore this child page.
+ */
+ *statep = ref->addr == NULL ? WT_CHILD_IGNORE : WT_CHILD_PROXY;
+ return (0);
+}
+
+/*
+ * __rec_incr --
+ * Update the memory tracking structure for a set of new entries.
+ */
+static inline void
+__rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size)
+{
+ /*
+ * The buffer code is fragile and prone to off-by-one errors -- check
+ * for overflow in diagnostic mode.
+ */
+ WT_ASSERT(session, r->space_avail >= size);
+ WT_ASSERT(session,
+ WT_BLOCK_FITS(r->first_free, size, r->dsk.mem, r->page_size));
+
+ r->entries += v;
+ r->space_avail -= size;
+ r->first_free += size;
+}
+
+/*
+ * __rec_copy_incr --
+ * Copy a key/value cell and buffer pair into the new image.
+ */
+static inline void
+__rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv)
+{
+ size_t len;
+ uint8_t *p, *t;
+
+ /*
+ * If there's only one chunk of data to copy (because the cell and data
+ * are being copied from the original disk page), the cell length won't
+ * be set, the WT_ITEM data/length will reference the data to be copied.
+ *
+ * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do
+ * the copy in-line.
+ */
+ for (p = (uint8_t *)r->first_free,
+ t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len)
+ *p++ = *t++;
+
+ /* The data can be quite large -- call memcpy. */
+ if (kv->buf.size != 0)
+ memcpy(p, kv->buf.data, kv->buf.size);
+
+ WT_ASSERT(session, kv->len == kv->cell_len + kv->buf.size);
+ __rec_incr(session, r, 1, kv->len);
+}
+
+/*
+ * __rec_dict_replace --
+ * Check for a dictionary match.
+ */
+static int
+__rec_dict_replace(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, uint64_t rle, WT_KV *val)
+{
+ WT_DICTIONARY *dp;
+ uint64_t offset;
+
+ /*
+ * We optionally create a dictionary of values and only write a unique
+ * value once per page, using a special "copy" cell for all subsequent
+ * copies of the value. We have to do the cell build and resolution at
+ * this low level because we need physical cell offsets for the page.
+ *
+ * Sanity check: short-data cells can be smaller than dictionary-copy
+ * cells. If the data is already small, don't bother doing the work.
+ * This isn't just work avoidance: on-page cells can't grow as a result
+ * of writing a dictionary-copy cell; the reconciliation functions do a
+ * split-boundary test based on the size required by the value's cell;
+ * if we grow the cell after that test we'll potentially write off the
+ * end of the buffer's memory.
+ */
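+ /*
+ * For example (hypothetical values): if a 500-byte value appears three
+ * times on the page, the first occurrence is written in full and its
+ * cell location recorded; the second and third are written as small
+ * copy cells holding the byte offset back to the first.
+ */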
+ if (val->buf.size <= WT_INTPACK32_MAXSIZE)
+ return (0);
+ WT_RET(__rec_dictionary_lookup(session, r, val, &dp));
+ if (dp == NULL)
+ return (0);
+
+ /*
+ * If the dictionary cell reference is not set, we're creating a new
+ * entry in the dictionary, update its location.
+ *
+ * If the dictionary cell reference is set, we have a matching value.
+ * Create a copy cell instead.
+ */
+ if (dp->cell == NULL)
+ dp->cell = r->first_free;
+ else {
+ offset = WT_PTRDIFF(r->first_free, dp->cell);
+ val->len = val->cell_len =
+ __wt_cell_pack_copy(&val->cell, rle, offset);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ }
+ return (0);
+}
+
+/*
+ * __rec_key_state_update --
+ * Update prefix and suffix compression based on the last key.
+ */
+static inline void
+__rec_key_state_update(WT_RECONCILE *r, int ovfl_key)
+{
+ WT_ITEM *a;
+
+ /*
+ * If writing an overflow key onto the page, don't update the "last key"
+ * value, and leave the state of prefix compression alone. (If we are
+ * currently doing prefix compression, we have a key state which will
+ * continue to work, we're just skipping the key just created because
+ * it's an overflow key and doesn't participate in prefix compression.
+ * If we are not currently doing prefix compression, we can't start, an
+ * overflow key doesn't give us any state.)
+ *
+ * Additionally, if we wrote an overflow key onto the page, turn off the
+ * suffix compression of row-store internal node keys. (When we split,
+ * "last key" is the largest key on the previous page, and "cur key" is
+ * the first key on the next page, which is being promoted. In some
+ * cases we can discard bytes from the "cur key" that are not needed to
+ * distinguish between the "last key" and "cur key", compressing the
+ * size of keys on internal nodes. If we just built an overflow key,
+ * we're not going to update the "last key", making suffix compression
+ * impossible for the next key. Alternatively, we could remember where
+ * the last key was on the page, detect it's an overflow key, read it
+ * from disk and do suffix compression, but that's too much work for an
+ * unlikely event.)
+ *
+ * If we're not writing an overflow key on the page, update the last-key
+ * value and turn on both prefix and suffix compression.
+ */
+ if (ovfl_key)
+ r->key_sfx_compress = 0;
+ else {
+ a = r->cur;
+ r->cur = r->last;
+ r->last = a;
+
+ r->key_pfx_compress = r->key_pfx_compress_conf;
+ r->key_sfx_compress = r->key_sfx_compress_conf;
+ }
+}
+
+/*
+ * Macros to convert fixed-length entries to/from bytes.
+ */
+#define WT_FIX_BYTES_TO_ENTRIES(btree, bytes) \
+ ((uint32_t)((((bytes) * 8) / (btree)->bitcnt)))
+#define WT_FIX_ENTRIES_TO_BYTES(btree, entries) \
+ ((uint32_t)WT_ALIGN((entries) * (btree)->bitcnt, 8))
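+ /*
+ * For example, with bitcnt 8 a 4096-byte image holds 4096 entries, and
+ * with bitcnt 1 it holds 32768; WT_FIX_ENTRIES_TO_BYTES rounds the
+ * entry bits up to a byte boundary.
+ */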
+
+/*
+ * __rec_leaf_page_max --
+ * Figure out the maximum leaf page size for the reconciliation.
+ */
+static inline uint32_t
+__rec_leaf_page_max(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_PAGE *page;
+ uint32_t page_size;
+
+ btree = S2BT(session);
+ page = r->page;
+
+ page_size = 0;
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Fixed-length objects are a problem: if
+ * there's a big missing range, we could theoretically have to
+ * write large numbers of missing objects.
+ */
+ page_size = (uint32_t)WT_ALIGN(WT_FIX_ENTRIES_TO_BYTES(btree,
+ r->salvage->take + r->salvage->missing), btree->allocsize);
+ break;
+ case WT_PAGE_COL_VAR:
+ /*
+ * Column-store pages can grow if there are missing records
+ * (that is, we lost a chunk of the range, and have to write
+ * deleted records). Variable-length objects aren't usually a
+ * problem: because of RLE encoding, we can write any number of
+ * deleted records in a single page entry; we just need to
+ * ensure that additional entry fits.
+ */
+ break;
+ case WT_PAGE_ROW_LEAF:
+ default:
+ /*
+ * Row-store pages can't grow: salvage never does anything
+ * other than reduce the size of a page read from disk.
+ */
+ break;
+ }
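+ /*
+ * For example (hypothetical salvage numbers): a fixed-length store
+ * that lost a chunk of a million records must write a million deleted
+ * records, so the size calculated above can greatly exceed the
+ * configured maximum leaf page size.
+ */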
+
+ /*
+ * Default size for variable-length column-store and row-store pages
+ * during salvage is the maximum leaf page size.
+ */
+ if (page_size < btree->maxleafpage)
+ page_size = btree->maxleafpage;
+
+ /*
+ * The page we read from the disk should be smaller than the page size
+ * we just calculated; check out of paranoia.
+ */
+ if (page_size < page->dsk->mem_size)
+ page_size = page->dsk->mem_size;
+
+ /*
+ * Salvage is the backup plan: don't let this fail.
+ */
+ return (page_size * 2);
+}
+
+/*
+ * __rec_split_bnd_init --
+ * Initialize a single boundary structure.
+ */
+static void
+__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
+{
+ bnd->start = NULL;
+
+ bnd->recno = 0;
+ bnd->entries = 0;
+
+ __wt_free(session, bnd->addr.addr);
+ WT_CLEAR(bnd->addr);
+ bnd->size = 0;
+ bnd->cksum = 0;
+ __wt_free(session, bnd->dsk);
+
+ __wt_free(session, bnd->skip);
+ bnd->skip_next = 0;
+ bnd->skip_allocated = 0;
+
+ /* Ignore the key; we re-use that memory in each new reconciliation. */
+
+ bnd->already_compressed = 0;
+}
+
+/*
+ * __rec_split_bnd_grow --
+ * Grow the boundary array as necessary.
+ */
+static int
+__rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ /*
+ * Make sure there's enough room for another boundary. The calculation
+ * is +2, because when filling in the current boundary's information,
+ * we save the start point of the next boundary (for example, a record
+ * number or key), in the (current + 1) slot.
+ *
+ * For the same reason, we're always initializing one ahead.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &r->bnd_allocated, r->bnd_next + 2, &r->bnd));
+ r->bnd_entries = r->bnd_allocated / sizeof(r->bnd[0]);
+
+ __rec_split_bnd_init(session, &r->bnd[r->bnd_next + 1]);
+
+ return (0);
+}
+
+/*
+ * __rec_split_init --
+ * Initialization for the reconciliation split functions.
+ */
+static int
+__rec_split_init(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, uint64_t recno, uint32_t max)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_PAGE_HEADER *dsk;
+ size_t corrected_page_size;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ /*
+ * The maximum leaf page size governs when an in-memory leaf page splits
+ * into multiple on-disk pages; however, salvage can't be allowed to
+ * split; there's no parent page yet. If we're doing salvage, override
+ * the caller's selection of a maximum page size, choosing a page size
+ * that ensures we won't split.
+ */
+ if (r->salvage != NULL)
+ max = __rec_leaf_page_max(session, r);
+
+ /*
+ * Set the page sizes. If we're doing the page layout, the maximum page
+ * size is the same as the page size. If the application is doing page
+ * layout (raw compression is configured), we accumulate some amount of
+ * additional data because we don't know how well it will compress, and
+ * we don't want to increment our way up to the amount of data needed by
+ * the application to successfully compress to the target page size.
+ */
+ r->page_size = r->page_size_max = max;
+ if (r->raw_compression)
+ r->page_size *= 10;
+
+ /*
+ * Ensure the disk image buffer is large enough for the max object, as
+ * corrected by the underlying block manager.
+ */
+ corrected_page_size = r->page_size;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, &r->dsk, corrected_page_size));
+
+ /*
+ * Clear the disk page's header and block-manager space, set the page
+ * type (the type doesn't change, and setting it later would require
+ * additional code in a few different places).
+ */
+ dsk = r->dsk.mem;
+ memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree));
+ dsk->type = page->type;
+
+ /*
+ * If we have to split, we want to choose a smaller page size for the
+ * split pages, because otherwise we could end up splitting one large
+ * packed page over and over. We don't want to pick the minimum size
+ * either, because that penalizes an application that did a bulk load
+ * and subsequently inserted a few items into packed pages. Currently
+ * defaulted to 75%, but I have no empirical evidence that's "correct".
+ *
+ * The maximum page size may be a multiple of the split page size (for
+ * example, there's a maximum page size of 128KB, but because the table
+ * is active and we don't want to split a lot, the split size is 20KB).
+ * The maximum page size may NOT be an exact multiple of the split page
+ * size.
+ *
+ * It's lots of work to build these pages and we don't want to start over
+ * when we reach the maximum page size (it's painful to restart after
+ * creating overflow items and compacted data, for example, as those
+ * items have already been written to disk). So, the loop calls the
+ * helper functions when approaching a split boundary, and we save the
+ * information at that point. That allows us to go back and split the
+ * page at the boundary points if we eventually overflow the maximum
+ * page size.
+ *
+ * Finally, all this doesn't matter for fixed-size column-store pages,
+ * raw compression, and salvage. Fixed-size column store pages can
+ * split under (very) rare circumstances, but they're allocated at a
+ * fixed page size, never anything smaller. In raw compression, the
+ * underlying compression routine decides when we split, so it's not
+ * our problem. In salvage, as noted above, we can't split at all.
+ */
+ if (r->raw_compression || r->salvage != NULL) {
+ r->split_size = 0;
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ } else if (page->type == WT_PAGE_COL_FIX) {
+ r->split_size = r->page_size_max;
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ } else {
+ r->split_size = __wt_split_page_size(btree, r->page_size_max);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ }
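+ /*
+ * For example (hypothetical sizes): with a 128KB maximum page size and
+ * a 75% split percentage, __wt_split_page_size yields roughly 96KB
+ * split chunks; boundary information is saved as each 96KB chunk
+ * fills, so a page overflowing 128KB can be split at those points.
+ */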
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /* Initialize the first boundary. */
+ r->bnd_next = 0;
+ WT_RET(__rec_split_bnd_grow(session, r));
+ __rec_split_bnd_init(session, &r->bnd[0]);
+ r->bnd[0].recno = recno;
+ r->bnd[0].start = WT_PAGE_HEADER_BYTE(btree, dsk);
+
+ /*
+ * If the maximum page size is the same as the split page size, either
+ * because of the object type or application configuration, there isn't
+ * any need to maintain split boundaries within a larger page.
+ *
+ * No configuration for salvage here, because salvage can't split.
+ */
+ if (r->raw_compression)
+ r->bnd_state = SPLIT_TRACKING_RAW;
+ else if (max == r->split_size)
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ else
+ r->bnd_state = SPLIT_BOUNDARY;
+
+ /* Initialize the entry counters. */
+ r->entries = r->total_entries = 0;
+
+ /* Initialize the starting record number. */
+ r->recno = recno;
+
+ /* New page, compression off. */
+ r->key_pfx_compress = r->key_sfx_compress = 0;
+
+ return (0);
+}
+
+/*
+ * __rec_is_checkpoint --
+ * Return whether we're writing a checkpoint.
+ */
+static int
+__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+{
+ /*
+ * Check to see if we're going to create a checkpoint.
+ *
+ * This function exists as a place to hang this comment.
+ *
+ * Any time we write the root page of the tree without splitting we are
+ * creating a checkpoint (and have to tell the underlying block manager
+ * so it creates and writes the additional information checkpoints
+ * require). However, checkpoints are completely consistent, and so we
+ * have to resolve information about the blocks we're expecting to free
+ * as part of the checkpoint, before writing the checkpoint. In short,
+ * we don't do checkpoint writes here; clear the boundary information as
+ * a reminder and create the checkpoint during wrapup.
+ */
+ if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
+ bnd->addr.addr = NULL;
+ bnd->addr.size = 0;
+ bnd->addr.type = 0;
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote_cell --
+ * Get a key from a cell for the purposes of promotion.
+ */
+static int
+__rec_split_row_promote_cell(
+ WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, WT_ITEM *key)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack;
+
+ btree = S2BT(session);
+ kpack = &_kpack;
+
+ /*
+ * The cell had better have a zero-length prefix and not be a copy cell;
+ * the first cell on a page cannot refer to an earlier cell on the page.
+ */
+ cell = WT_PAGE_HEADER_BYTE(btree, dsk);
+ __wt_cell_unpack(cell, kpack);
+ WT_ASSERT(session,
+ kpack->prefix == 0 && kpack->raw != WT_CELL_VALUE_COPY);
+
+ WT_RET(__wt_cell_data_copy(session, dsk->type, kpack, key));
+ return (0);
+}
+
+/*
+ * __rec_split_row_promote --
+ * Key promotion for a row-store.
+ */
+static int
+__rec_split_row_promote(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, uint8_t type)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(update);
+ WT_DECL_RET;
+ WT_ITEM *max;
+ WT_UPD_SKIPPED *skip;
+ size_t cnt, len, size;
+ uint32_t i;
+ const uint8_t *pa, *pb;
+ int cmp;
+
+ /*
+ * For a column-store, the promoted key is the recno and we already have
+ * a copy. For a row-store, it's the first key on the page, a variable-
+ * length byte string; get a copy.
+ *
+ * This function is called from the split code at each split boundary,
+ * but that means we're not called before the first boundary, and we
+ * will eventually have to get the first key explicitly when splitting
+ * a page.
+ *
+ * For the current slot, take the last key we built, after doing suffix
+ * compression. The "last key we built" describes some process: before
+ * calling the split code, we must place the last key on the page before
+ * the boundary into the "last" key structure, and the first key on the
+ * page after the boundary into the "current" key structure, we're going
+ * to compare them for suffix compression.
+ *
+ * Suffix compression is a hack to shorten keys on internal pages. We
+ * only need enough bytes in the promoted key to ensure searches go to
+ * the correct page: the promoted key has to be larger than the last key
+ * on the leaf page preceding it, but we don't need any more bytes than
+ * that. In other words, we can discard any suffix bytes not required
+ * to distinguish between the key being promoted and the last key on the
+ * leaf page preceding it. This can only be done for the first level of
+ * internal pages: you cannot repeat suffix truncation as you split up
+ * the tree, because it loses too much information.
+ *
+ * Note #1: if the last key on the previous page was an overflow key,
+ * we don't have the in-memory key against which to compare, and don't
+ * try to do suffix compression. The code for that case turns suffix
+ * compression off for the next key, we don't have to deal with it here.
+ */
+ if (type != WT_PAGE_ROW_LEAF || !r->key_sfx_compress)
+ return (__wt_buf_set(session, key, r->cur->data, r->cur->size));
+
+ btree = S2BT(session);
+ WT_RET(__wt_scr_alloc(session, 0, &update));
+
+ /*
+ * Note #2: if we skipped updates, an update key may be larger than the
+ * last key stored in the previous block (probable for append-centric
+ * workloads). If there are skipped updates, check for one larger than
+ * the last key and smaller than the current key.
+ */
+ max = r->last;
+ for (i = r->skip_next; i > 0; --i) {
+ skip = &r->skip[i - 1];
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, skip->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(skip->ins);
+ update->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+
+ /* Compare against the current key; it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
+
+ /* Compare against the last key; it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
+
+ /*
+ * The skipped updates are in key-sort order so the entry we're
+ * looking for is either the last one or the next-to-last one
+ * in the list. Once we've compared an entry against the last
+ * key on the page, we're done.
+ */
+ break;
+ }
+
+ /*
+ * The largest key on the last block must sort before the current key,
+ * so we'll either find a larger byte value in the current key, or the
+ * current key will be a longer key, and the interesting byte is one
+ * past the length of the shorter key.
+ */
+ pa = max->data;
+ pb = r->cur->data;
+ len = WT_MIN(max->size, r->cur->size);
+ size = len + 1;
+ for (cnt = 1; len > 0; ++cnt, --len, ++pa, ++pb)
+ if (*pa != *pb) {
+ if (size != cnt) {
+ WT_STAT_FAST_DATA_INCRV(session,
+ rec_suffix_compression, size - cnt);
+ size = cnt;
+ }
+ break;
+ }
+ ret = __wt_buf_set(session, key, r->cur->data, size);
+
+err: __wt_scr_free(&update);
+ return (ret);
+}
+
+/*
+ * __rec_split --
+ * Handle the page reconciliation bookkeeping. (Did you know "bookkeeper"
+ * has 3 doubled letters in a row? Sweet-tooth does, too.)
+ */
+static int
+__rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BTREE *btree;
+ WT_BOUNDARY *last, *next;
+ WT_PAGE_HEADER *dsk;
+ uint32_t len;
+
+ /*
+ * We should never split during salvage, and we're about to drop core
+ * because there's no parent page.
+ */
+ if (r->salvage != NULL)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(r->page->type));
+
+ /*
+ * Handle page-buffer size tracking; we have to do this work in every
+ * reconciliation loop, and I don't want to repeat the code that many
+ * times.
+ */
+ btree = S2BT(session);
+ dsk = r->dsk.mem;
+
+ /* Hitting a page boundary resets the dictionary, in all cases. */
+ __rec_dictionary_reset(r);
+
+ /*
+ * There are 3 cases we have to handle.
+ *
+ * #1
+ * About to cross a split boundary: save current boundary information
+ * and return.
+ *
+ * #2
+ * About to cross the maximum boundary: use saved boundary information
+ * to write all of the split pages.
+ *
+ * #3
+ * About to cross a split boundary, but we've either already done the
+ * split thing when we approached the maximum boundary, in which
+ * case we write the page and keep going, or we were never tracking
+ * split boundaries at all.
+ *
+ * Cases #1 and #2 are the hard ones: we're called when we're about to
+ * cross each split boundary, and we save information away so we can
+ * split if we have to. We're also called when we're about to cross
+ * the maximum page boundary: in that case, we do the actual split and
+ * clean up all the previous boundaries, then keep going.
+ */
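+ /*
+ * A concrete walkthrough (hypothetical sizes): with 96KB split chunks
+ * in a 128KB page, case #1 fires as each chunk fills, saving a
+ * boundary; when the page itself fills, case #2 writes the saved
+ * chunks and switches to SPLIT_TRACKING_OFF; after that, case #3
+ * writes a block each time a split-size chunk fills.
+ */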
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY: /* Case #1 */
+ /*
+ * Save the information about where we are when the split would
+ * have happened.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /* Set the number of entries for the just finished chunk. */
+ last->entries = r->entries - r->total_entries;
+ r->total_entries = r->entries;
+
+ /* Set the key for the next chunk. */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /*
+ * Set the starting buffer address and clear the entries (the
+ * latter not required, but cleaner).
+ */
+ next->start = r->first_free;
+ next->entries = 0;
+
+ /*
+ * Set the space available to another split-size chunk, if we
+ * have one. If we don't have room for another split chunk,
+ * add whatever space remains in the maximum page size, and
+ * hope it's enough.
+ */
+ len = WT_PTRDIFF32(r->first_free, dsk);
+ if (len + r->split_size <= r->page_size)
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ else {
+ r->bnd_state = SPLIT_MAX;
+ r->space_avail = r->page_size -
+ (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ }
+ break;
+ case SPLIT_MAX: /* Case #2 */
+ /*
+ * It didn't all fit into a single page.
+ *
+ * Cycle through the saved split-point information, writing the
+ * split chunks we have tracked.
+ */
+ WT_RET(__rec_split_fixup(session, r));
+
+ /* We're done saving split chunks. */
+ r->bnd_state = SPLIT_TRACKING_OFF;
+ break;
+ case SPLIT_TRACKING_OFF: /* Case #3 */
+ /*
+ * It didn't all fit, but either we've already noticed it and
+ * are now processing the rest of the page at the split-size
+ * boundaries, or the split size was the same as the page size,
+ * so we never bothered with saving split-point information.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next++];
+ next = last + 1;
+
+ /*
+ * Set the key for the next chunk (before writing the block, a
+ * key range is needed in that code).
+ */
+ next->recno = r->recno;
+ if (dsk->type == WT_PAGE_ROW_INT ||
+ dsk->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__rec_split_row_promote(
+ session, r, &next->key, dsk->type));
+
+ /* Clear the entries (not required, but cleaner). */
+ next->entries = 0;
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = last->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ WT_RET(__rec_split_write(session, r, last, &r->dsk, 0));
+
+ /*
+ * Set the caller's entry count and buffer information for the
+ * next chunk. We only get here if we're not splitting or have
+ * already split, so it's split-size chunks from here on out.
+ */
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail =
+ r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+ break;
+ case SPLIT_TRACKING_RAW:
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * __rec_split_raw_worker --
+ * Handle the raw compression page reconciliation bookkeeping.
+ */
+static int
+__rec_split_raw_worker(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, int no_more_rows)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *last, *next;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_COMPRESSOR *compressor;
+ WT_DECL_RET;
+ WT_ITEM *dst, *write_ref;
+ WT_PAGE_HEADER *dsk, *dsk_dst;
+ WT_SESSION *wt_session;
+ size_t corrected_page_size, len, result_len;
+ uint64_t recno;
+ uint32_t entry, i, result_slots, slots;
+ int last_block;
+ uint8_t *dsk_start;
+
+ wt_session = (WT_SESSION *)session;
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ unpack = &_unpack;
+ compressor = btree->compressor;
+ dst = &r->raw_destination;
+ dsk = r->dsk.mem;
+
+ WT_RET(__rec_split_bnd_grow(session, r));
+ last = &r->bnd[r->bnd_next];
+ next = last + 1;
+
+ /*
+ * Build arrays of offsets and cumulative counts of cells and rows in
+ * the page: the offset is the byte offset to the possible split point
+ * (adjusted for an initial chunk that cannot be compressed); entries
+ * is the cumulative page entries covered by the byte offset; recnos is
+ * the cumulative rows covered by the byte offset.
+ */
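+ /*
+ * For example (hypothetical layout): row-store pages can split only at
+ * keys, so each key cell past the first allocation size of data adds a
+ * candidate slot; raw_offsets[i] then holds the compressible byte
+ * offset of the i-th candidate split point and raw_entries[i] the
+ * number of cells it covers.
+ */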
+ if (r->entries >= r->raw_max_slots) {
+ __wt_free(session, r->raw_entries);
+ __wt_free(session, r->raw_offsets);
+ __wt_free(session, r->raw_recnos);
+ r->raw_max_slots = 0;
+
+ i = r->entries + 100;
+ WT_RET(__wt_calloc_def(session, i, &r->raw_entries));
+ WT_RET(__wt_calloc_def(session, i, &r->raw_offsets));
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ WT_RET(__wt_calloc_def(session, i, &r->raw_recnos));
+ r->raw_max_slots = i;
+ }
+
+ /*
+ * We're going to walk the disk image, which requires setting the
+ * number of entries.
+ */
+ dsk->u.entries = r->entries;
+
+ /*
+ * We track the record number at each column-store split point, set an
+ * initial value.
+ */
+ recno = 0;
+ if (dsk->type == WT_PAGE_COL_VAR)
+ recno = last->recno;
+
+ entry = slots = 0;
+ WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
+ ++entry;
+
+ /*
+ * Row-store pages can split at keys, but not at values;
+ * column-store pages can split at values.
+ */
+ __wt_cell_unpack(cell, unpack);
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_SHORT:
+ break;
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_DEL:
+ case WT_CELL_VALUE:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_SHORT:
+ if (dsk->type == WT_PAGE_COL_INT) {
+ recno = unpack->v;
+ break;
+ }
+ if (dsk->type == WT_PAGE_COL_VAR) {
+ recno += __wt_cell_rle(unpack);
+ break;
+ }
+ r->raw_entries[slots] = entry;
+ continue;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We can't compress the first 64B of the block (it must be
+ * written without compression), and a possible split point
+ * may appear in that 64B; keep it simple, ignore the first
+ * allocation size of data; anybody splitting smaller than
+ * that (as calculated before compression) is doing it wrong.
+ */
+ if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);
+
+ if (dsk->type == WT_PAGE_COL_INT ||
+ dsk->type == WT_PAGE_COL_VAR)
+ r->raw_recnos[slots] = recno;
+ r->raw_entries[slots] = entry;
+ }
+
+ /*
+ * If we haven't managed to find at least one split point, we're done;
+ * don't bother calling the underlying compression function.
+ */
+ if (slots == 0) {
+ result_len = 0;
+ result_slots = 0;
+ goto no_slots;
+ }
+
+ /* The slot at the array's end is the total length of the data. */
+ r->raw_offsets[++slots] =
+ WT_STORE_SIZE(WT_PTRDIFF(cell, dsk) - WT_BLOCK_COMPRESS_SKIP);
+
+ /*
+ * Allocate a destination buffer. If there's a pre-size function, use
+ * it to determine the destination buffer's minimum size, otherwise the
+ * destination buffer is documented to be at least the maximum object
+ * size.
+ *
+ * The destination buffer really only needs to be large enough for the
+ * target block size, corrected for the requirements of the underlying
+ * block manager. If the target block size is 8KB, that's a multiple
+ * of 512B and so the underlying block manager is fine with it. But...
+ * we don't control what the pre_size method returns us as a required
+ * size, and we don't want to document the compress_raw method has to
+ * skip bytes in the buffer because that's confusing, so do something
+ * more complicated. First, find out how much space the compress_raw
+ * function might need, either the value returned from pre_size, or the
+ * maximum object size. Add the compress-skip bytes, and then correct
+ * that value for the underlying block manager. As a result, we have
+ * a destination buffer that's the right "object" size when calling the
+ * compress_raw method, and there are bytes in the header just for us.
+ */
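+ /*
+ * For example (hypothetical sizes): if pre_size reports 32KB, we size
+ * the buffer at 32KB plus WT_BLOCK_COMPRESS_SKIP header bytes, then
+ * let the block manager round that up to its allocation size.
+ */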
+ if (compressor->pre_size == NULL)
+ result_len = r->page_size_max;
+ else
+ WT_RET(compressor->pre_size(compressor, wt_session,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ (size_t)r->raw_offsets[slots], &result_len));
+ corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_init(session, dst, corrected_page_size));
+
+ /*
+ * Copy the header bytes into the destination buffer, then call the
+ * compression function.
+ */
+ memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
+ ret = compressor->compress_raw(compressor, wt_session,
+ r->page_size_max, btree->split_pct,
+ WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ r->raw_offsets, slots,
+ (uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ result_len, no_more_rows, &result_len, &result_slots);
+ switch (ret) {
+ case EAGAIN:
+ /*
+ * The compression function wants more rows; accumulate and
+ * retry.
+ *
+ * Reset the resulting slots count, just in case the compression
+ * function modified it before giving up.
+ */
+ result_slots = 0;
+ break;
+ case 0:
+ /*
+ * If the compression function returned zero result slots, it's
+ * giving up and we write the original data. (This is a pretty
+ * bad result: we've not done compression on a block much larger
+ * than the maximum page size, but once compression gives up,
+ * there's not much else we can do.)
+ *
+ * If the compression function returned non-zero result slots,
+ * we were successful and have a block to write.
+ */
+ if (result_slots == 0) {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ /*
+ * If there are no more rows, we can write the original
+ * data from the original buffer.
+ */
+ if (no_more_rows)
+ break;
+
+ /*
+ * Copy the original data to the destination buffer, as
+ * if the compression function simply copied it. Take
+ * all but the last row of the original data (the last
+ * row has to be set as the key for the next block).
+ */
+ result_slots = slots - 1;
+ result_len = r->raw_offsets[result_slots];
+ WT_RET(__wt_buf_grow(
+ session, dst, result_len + WT_BLOCK_COMPRESS_SKIP));
+ memcpy((uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
+ result_len);
+
+ /*
+ * Mark it as uncompressed so the standard compression
+ * function is called before the buffer is written.
+ */
+ last->already_compressed = 0;
+ } else {
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_ok);
+
+ /*
+ * If there are more rows and the compression function
+ * consumed all of the current data, there are problems:
+ * First, with row-store objects, we're potentially
+ * skipping updates; we must have a key for the next
+ * block so we know with what block a skipped update is
+ * associated. Second, if the compression function
+ * compressed all of the data, we're not pushing it
+ * hard enough (unless we got lucky and gave it exactly
+ * the right amount to work with, which is unlikely).
+ * Handle both problems by accumulating more data any
+ * time we're not writing the last block and compression
+ * ate all of the rows.
+ */
+ if (result_slots == slots && !no_more_rows)
+ result_slots = 0;
+ else
+ last->already_compressed = 1;
+ }
+ break;
+ default:
+ return (ret);
+ }
+
+no_slots:
+ /*
+ * Check for the last block we're going to write: if no more rows and
+ * we failed to compress anything, or we compressed everything, it's
+ * the last block.
+ */
+ last_block = no_more_rows &&
+ (result_slots == 0 || result_slots == slots);
+
+ if (result_slots != 0) {
+ /*
+ * We have a block, finalize the header information.
+ */
+ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst = dst->mem;
+ dsk_dst->recno = last->recno;
+ dsk_dst->mem_size =
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP;
+ dsk_dst->u.entries = r->raw_entries[result_slots - 1];
+
+ /*
+ * There is likely a remnant in the working buffer that didn't
+ * get compressed; copy it down to the start of the buffer and
+ * update the starting record number, free space and so on.
+ * !!!
+ * Note the use of memmove: the source and destination buffers
+ * can overlap.
+ */
+ len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
+ r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
+
+ r->entries -= r->raw_entries[result_slots - 1];
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+
+ /*
+ * Set the key for the next block (before writing the block, a
+ * key range is needed in that code).
+ */
+ switch (dsk->type) {
+ case WT_PAGE_COL_INT:
+ next->recno = r->raw_recnos[result_slots];
+ break;
+ case WT_PAGE_COL_VAR:
+ next->recno = r->raw_recnos[result_slots - 1];
+ break;
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ next->recno = 0;
+ if (!last_block) {
+ /*
+ * Confirm there was uncompressed data remaining
+ * in the buffer; we're about to read it for the
+ * next chunk's initial key.
+ */
+ WT_ASSERT(session, len > 0);
+ WT_RET(__rec_split_row_promote_cell(
+ session, dsk, &next->key));
+ }
+ break;
+ }
+ write_ref = dst;
+ } else if (no_more_rows) {
+ /*
+ * Compression failed and there are no more rows to accumulate,
+ * write the original buffer instead.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail);
+
+ dsk->recno = last->recno;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+ dsk->u.entries = r->entries;
+
+ r->entries = 0;
+ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk);
+ r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree);
+
+ write_ref = &r->dsk;
+ last->already_compressed = 0;
+ } else {
+ /*
+ * Compression failed, there are more rows to accumulate and the
+ * compression function wants to try again; increase the size of
+ * the "page" and try again after we accumulate some more rows.
+ */
+ WT_STAT_FAST_DATA_INCR(session, compress_raw_fail_temporary);
+
+ len = WT_PTRDIFF(r->first_free, r->dsk.mem);
+ corrected_page_size = r->page_size * 2;
+ WT_RET(bm->write_size(bm, session, &corrected_page_size));
+ WT_RET(__wt_buf_grow(session, &r->dsk, corrected_page_size));
+ r->page_size *= 2;
+ r->first_free = (uint8_t *)r->dsk.mem + len;
+ r->space_avail =
+ r->page_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len);
+ return (0);
+ }
+
+ /* We have a block, update the boundary counter. */
+ ++r->bnd_next;
+
+ /*
+ * If we are writing the whole page in our first/only attempt, it might
+ * be a checkpoint (checkpoints are only a single page, by definition).
+ * Further, checkpoints aren't written here, the wrapup functions do the
+ * write, and they do the write from the original buffer location. If
+ * it's a checkpoint and the block isn't in the right buffer, copy it.
+ *
+ * If it's not a checkpoint, write the block.
+ */
+ if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (write_ref == dst)
+ WT_RET(__wt_buf_set(
+ session, &r->dsk, dst->mem, dst->size));
+ } else
+ WT_RET(
+ __rec_split_write(session, r, last, write_ref, last_block));
+ return (0);
+}
+
+/*
+ * __rec_raw_decompress --
+ * Decompress a raw-compressed image.
+ */
+static int
+__rec_raw_decompress(
+ WT_SESSION_IMPL *session, const void *image, size_t size, void *retp)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER const *dsk;
+ size_t result_len;
+
+ btree = S2BT(session);
+ dsk = image;
+
+ /*
+ * We skipped an update and we can't write a block, but unfortunately,
+ * the block has already been compressed. Decompress the block so we
+ * can subsequently re-instantiate it in memory.
+ */
+ WT_RET(__wt_scr_alloc(session, dsk->mem_size, &tmp));
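+ /*
+ * The leading WT_BLOCK_COMPRESS_SKIP bytes of the image are never
+ * compressed; copy them as-is, then decompress the rest.
+ */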
+ memcpy(tmp->mem, image, WT_BLOCK_COMPRESS_SKIP);
+ WT_ERR(btree->compressor->decompress(btree->compressor,
+ &session->iface,
+ (uint8_t *)image + WT_BLOCK_COMPRESS_SKIP,
+ size - WT_BLOCK_COMPRESS_SKIP,
+ (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
+ dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
+ &result_len));
+ if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+ WT_ERR(__wt_illegal_value(session, btree->dhandle->name));
+
+ WT_ERR(__wt_strndup(session, tmp->data, dsk->mem_size, retp));
+ WT_ASSERT(session, __wt_verify_dsk_image(
+ session, "[raw evict split]", tmp->data, dsk->mem_size) == 0);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_raw --
+ * Raw compression split routine.
+ */
+static inline int
+__rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ return (__rec_split_raw_worker(session, r, 0));
+}
+
+/*
+ * __rec_split_finish_std --
+ * Finish processing a page, standard version.
+ */
+static int
+__rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_PAGE_HEADER *dsk;
+
+ /* Adjust the boundary information based on our split status. */
+ switch (r->bnd_state) {
+ case SPLIT_BOUNDARY:
+ case SPLIT_MAX:
+ /*
+ * We never split: the reconciled page fit into the maximum page
+ * size. Change the first boundary slot to represent the full
+ * page (the first boundary slot is largely correct, just update
+ * the number of entries).
+ */
+ r->bnd_next = 0;
+ break;
+ case SPLIT_TRACKING_OFF:
+ /*
+ * If we have already split, or aren't tracking boundaries, put
+ * the remaining data in the next boundary slot.
+ */
+ WT_RET(__rec_split_bnd_grow(session, r));
+ break;
+ case SPLIT_TRACKING_RAW:
+ /*
+ * We were configured for raw compression, but never actually
+ * wrote anything.
+ */
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We only arrive here with no entries to write if the page was entirely
+ * empty, and if the page is empty, we merge it into its parent during
+ * the parent's reconciliation. A page with skipped updates isn't truly
+ * empty, continue on.
+ */
+ if (r->entries == 0 && r->skip_next == 0)
+ return (0);
+
+ /* Set the boundary reference and increment the count. */
+ bnd = &r->bnd[r->bnd_next++];
+ bnd->entries = r->entries;
+
+ /* Finalize the header information. */
+ dsk = r->dsk.mem;
+ dsk->recno = bnd->recno;
+ dsk->u.entries = r->entries;
+ dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
+
+ /* If this is a checkpoint, we're done, otherwise write the page. */
+ return (
+ __rec_is_checkpoint(r, bnd) ? 0 :
+ __rec_split_write(session, r, bnd, &r->dsk, 1));
+}
+
+/*
+ * __rec_split_finish --
+ * Finish processing a page.
+ */
+static int
+__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ /* We're done reconciling, write the final page. */
+ if (r->raw_compression && r->entries != 0) {
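+ /*
+ * Raw compression may not write out all the accumulated
+ * entries in a single call, loop until none remain.
+ */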
+ while (r->entries != 0)
+ WT_RET(__rec_split_raw_worker(session, r, 1));
+ } else
+ WT_RET(__rec_split_finish_std(session, r));
+
+ return (0);
+}
+
+/*
+ * __rec_split_fixup --
+ * Fix up after crossing the maximum page boundary.
+ */
+static int
+__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE_HEADER *dsk;
+ uint32_t i, len;
+ uint8_t *dsk_start;
+
+ /*
+ * When we overflow physical limits of the page, we walk the list of
+ * split chunks we've created and write those pages out, then update
+ * the caller's information.
+ */
+ btree = S2BT(session);
+
+ /*
+ * The data isn't laid out on a page boundary or nul padded; copy it to
+ * a clean, aligned, padded buffer before writing it.
+ *
+ * Allocate a scratch buffer to hold the new disk image. Copy the
+ * WT_PAGE_HEADER header onto the scratch buffer, most of the header
+ * information remains unchanged between the pages.
+ */
+ WT_RET(__wt_scr_alloc(session, r->page_size_max, &tmp));
+ dsk = tmp->mem;
+ memcpy(dsk, r->dsk.mem, WT_PAGE_HEADER_SIZE);
+
+ /*
+ * For each split chunk we've created, update the disk image and copy
+ * it into place.
+ */
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) {
+ /* Copy the page contents to the temporary buffer. */
+ len = WT_PTRDIFF32((bnd + 1)->start, bnd->start);
+ memcpy(dsk_start, bnd->start, len);
+
+ /* Finalize the header information and write the page. */
+ dsk->recno = bnd->recno;
+ dsk->u.entries = bnd->entries;
+ dsk->mem_size =
+ tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len;
+ WT_ERR(__rec_split_write(session, r, bnd, tmp, 0));
+ }
+
+ /*
+ * There is probably a remnant in the working buffer that didn't get
+ * written; copy it down to the beginning of the working buffer, and
+ * update the starting record number.
+ *
+ * Confirm the remnant is no larger than the available split buffer.
+ *
+ * Fix up our caller's information.
+ */
+ len = WT_PTRDIFF32(r->first_free, bnd->start);
+ if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree))
+ WT_PANIC_ERR(session, EINVAL,
+ "Reconciliation remnant too large for the split buffer");
+
+ dsk = r->dsk.mem;
+ dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
+ (void)memmove(dsk_start, bnd->start, len);
+
+ r->entries -= r->total_entries;
+ r->first_free = dsk_start + len;
+ r->space_avail =
+ (r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - len;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __rec_split_write --
+ * Write a disk block out for the split helper functions.
+ */
+static int
+__rec_split_write(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_BOUNDARY *bnd, WT_ITEM *buf, int last_block)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_MULTI *multi;
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ WT_PAGE_MODIFY *mod;
+ WT_UPD_SKIPPED *skip;
+ size_t addr_size;
+ uint32_t bnd_slot, i, j;
+ int cmp;
+ uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
+
+ btree = S2BT(session);
+ dsk = buf->mem;
+ page = r->page;
+ mod = page->modify;
+
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+
+ /* Set the zero-length value flag in the page header. */
+ if (dsk->type == WT_PAGE_ROW_LEAF) {
+ F_CLR(dsk, WT_PAGE_EMPTY_V_ALL | WT_PAGE_EMPTY_V_NONE);
+
+ if (r->entries != 0 && r->all_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_ALL);
+ if (r->entries != 0 && !r->any_empty_value)
+ F_SET(dsk, WT_PAGE_EMPTY_V_NONE);
+ }
+
+ /* Initialize the address (set the page type for the parent). */
+ switch (dsk->type) {
+ case WT_PAGE_COL_FIX:
+ bnd->addr.type = WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ bnd->addr.type = r->ovfl_items ? WT_ADDR_LEAF : WT_ADDR_LEAF_NO;
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ bnd->addr.type = WT_ADDR_INT;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
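+ /*
+ * Save the block's size; the checksum is only calculated later,
+ * and only if the block might be re-used.
+ */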
+ bnd->size = (uint32_t)buf->size;
+ bnd->cksum = 0;
+
+ /*
+ * Check if we've skipped updates that belong to this block, and move
+ * any to the per-block structure. Quit as soon as we find a skipped
+ * update that doesn't belong to the block, they're in sorted order.
+ *
+ * This code requires a key be filled in for the next block (or the
+ * last block flag be set, if there's no next block).
+ */
+ for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) {
+ /* The last block gets all remaining skipped updates. */
+ if (last_block) {
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ continue;
+ }
+
+ /*
+ * Get the skipped update's key and compare it with this block's
+ * key range. If the skipped update list belongs with the block
+ * we're about to write, move it to the per-block memory. Check
+ * only to the first update that doesn't go with the block, they
+ * must be in sorted order.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno)
+ goto skip_check_complete;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (skip->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, page, skip->rip, key, 0));
+ else {
+ key->data = WT_INSERT_KEY(skip->ins);
+ key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ }
+ WT_ERR(__wt_compare(session,
+ btree->collator, key, &(bnd + 1)->key, &cmp));
+ if (cmp >= 0)
+ goto skip_check_complete;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ }
+
+skip_check_complete:
+ /*
+ * If there are updates that weren't moved to the block, shuffle them to
+ * the beginning of the cached list (we maintain the skipped updates in
+ * sorted order, new skipped updates must be appended to the list).
+ */
+ for (j = 0; i < r->skip_next; ++j, ++i)
+ r->skip[j] = r->skip[i];
+ r->skip_next = j;
+
+ /*
+ * If we had to skip updates in order to build this disk image, we can't
+ * actually write it. Instead, we will re-instantiate the page using the
+ * disk image and the list of updates we skipped.
+ *
+ * If the buffer is compressed (raw compression was configured), we have
+ * to decompress it so we can instantiate it later.
+ */
+ if (bnd->skip != NULL) {
+ if (bnd->already_compressed)
+ WT_ERR(__rec_raw_decompress(
+ session, buf->data, buf->size, &bnd->dsk));
+ else {
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &bnd->dsk));
+ WT_ASSERT(session, __wt_verify_dsk_image(session,
+ "[evict split]", buf->data, buf->size) == 0);
+ }
+ goto done;
+ }
+
+ /*
+ * If we wrote this block before, re-use it. Pages get written in the
+ * same block order every time, only check the appropriate slot. The
+ * expensive part of this test is the checksum, only do that work when
+ * there has been or will be a reconciliation of this page involving
+ * split pages. This test isn't perfect: we're doing a checksum if a
+ * previous reconciliation of the page split or if we will split this
+ * time, but that test won't calculate a checksum on the first block
+ * the first time the page splits.
+ */
+ bnd_slot = (uint32_t)(bnd - r->bnd);
+ if (bnd_slot > 1 ||
+ (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) && mod->mod_multi != NULL)) {
+ /*
+ * There are page header fields which need to be cleared to get
+ * consistent checksums: specifically, the write generation and
+ * the memory owned by the block manager. We are reusing the
+ * same buffer space each time, clear it before calculating the
+ * checksum.
+ */
+ dsk->write_gen = 0;
+ memset(WT_BLOCK_HEADER_REF(dsk), 0, btree->block_header);
+ bnd->cksum = __wt_cksum(buf->data, buf->size);
+
+ if (F_ISSET(mod, WT_PM_REC_MULTIBLOCK) &&
+ mod->mod_multi_entries > bnd_slot) {
+ multi = &mod->mod_multi[bnd_slot];
+ if (multi->size == bnd->size &&
+ multi->cksum == bnd->cksum) {
+ multi->addr.reuse = 1;
+ bnd->addr = multi->addr;
+
+ WT_STAT_FAST_DATA_INCR(session, rec_page_match);
+ goto done;
+ }
+ }
+ }
+
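+ /* Write the block, then save a copy of its address cookie. */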
+ WT_ERR(__wt_bt_write(session,
+ buf, addr, &addr_size, 0, bnd->already_compressed));
+ WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
+ bnd->addr.size = (uint8_t)addr_size;
+
+done:
+err: __wt_scr_free(&key);
+ return (ret);
+}
+
+/*
+ * __wt_bulk_init --
+ * Bulk insert initialization.
+ */
+int
+__wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_PAGE_INDEX *pindex;
+ WT_RECONCILE *r;
+ uint64_t recno;
+
+ btree = S2BT(session);
+ /*
+ * Bulk-load is only permitted on newly created files, not any empty
+ * file -- see the checkpoint code for a discussion.
+ */
+ if (!btree->bulk_load_ok)
+ WT_RET_MSG(session, EINVAL,
+ "bulk-load is only possible for newly created trees");
+
+ /* Set a reference to the empty leaf page. */
+ pindex = WT_INTL_INDEX_COPY(btree->root.page);
+ cbulk->ref = pindex->index[0];
+ cbulk->leaf = cbulk->ref->page;
+
+ WT_RET(
+ __rec_write_init(session, cbulk->ref, 0, NULL, &cbulk->reconcile));
+ r = cbulk->reconcile;
+ r->is_bulk_load = 1;
+
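+ /*
+ * Set the initial record number: column-store records are numbered
+ * from 1, row stores don't use record numbers.
+ */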
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ case BTREE_COL_VAR:
+ recno = 1;
+ break;
+ case BTREE_ROW:
+ recno = 0;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (__rec_split_init(
+ session, r, cbulk->leaf, recno, btree->maxleafpage));
+}
+
+/*
+ * __wt_bulk_wrapup --
+ * Bulk insert cleanup.
+ */
+int
+__wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_PAGE *parent;
+ WT_RECONCILE *r;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ switch (btree->type) {
+ case BTREE_COL_FIX:
+ if (cbulk->entry != 0)
+ __rec_incr(session, r, cbulk->entry,
+ __bitstr_size(
+ (size_t)cbulk->entry * btree->bitcnt));
+ break;
+ case BTREE_COL_VAR:
+ if (cbulk->rle != 0)
+ WT_RET(__wt_bulk_insert_var(session, cbulk));
+ break;
+ case BTREE_ROW:
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ WT_RET(__rec_split_finish(session, r));
+ WT_RET(__rec_write_wrapup(session, r, r->page));
+
+ /* Mark the page's parent dirty. */
+ parent = r->ref->home;
+ WT_RET(__wt_page_modify_init(session, parent));
+ __wt_page_modify_set(session, parent);
+
+ __rec_destroy(session, &cbulk->reconcile);
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_row --
+ * Row-store bulk insert.
+ */
+int
+__wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_KV *key, *val;
+ WT_RECONCILE *r;
+ int ovfl_key;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ key = &r->k;
+ val = &r->v;
+ WT_RET(__rec_cell_build_leaf_key(session, r, /* Build key cell */
+ cursor->key.data, cursor->key.size, &ovfl_key));
+ WT_RET(__rec_cell_build_val(session, r, /* Build value cell */
+ cursor->value.data, cursor->value.size, (uint64_t)0));
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else {
+ WT_RET(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key written
+ * to the new page, and (unless we're already working
+ * with an overflow key), rebuild the key without prefix
+ * compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+ return (0);
+}
+
+/*
+ * __rec_col_fix_bulk_insert_split_check --
+ * Check if a bulk-loaded fixed-length column store page needs to split.
+ */
+static inline int
+__rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_RECONCILE *r;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ if (cbulk->entry == cbulk->nrecs) {
+ if (cbulk->entry != 0) {
+ /*
+ * If the page is full, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ */
+ __rec_incr(session, r, cbulk->entry,
+ __bitstr_size(
+ (size_t)cbulk->entry * btree->bitcnt));
+ WT_RET(__rec_split(session, r));
+ }
+ cbulk->entry = 0;
+ cbulk->nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_fix --
+ * Fixed-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_RECONCILE *r;
+ uint32_t entries, offset, page_entries, page_size;
+ const uint8_t *data;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+ cursor = &cbulk->cbt.iface;
+
+ if (cbulk->bitmap) {
+ if (((r->recno - 1) * btree->bitcnt) & 0x7)
+ WT_RET_MSG(session, EINVAL,
+ "Bulk bitmap load not aligned on a byte boundary");
+ for (data = cursor->value.data,
+ entries = (uint32_t)cursor->value.size;
+ entries > 0;
+ entries -= page_entries, data += page_size) {
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
+ page_entries =
+ WT_MIN(entries, cbulk->nrecs - cbulk->entry);
+ page_size = __bitstr_size(page_entries * btree->bitcnt);
+ offset = __bitstr_size(cbulk->entry * btree->bitcnt);
+ memcpy(r->first_free + offset, data, page_size);
+ cbulk->entry += page_entries;
+ r->recno += page_entries;
+ }
+ return (0);
+ }
+
+ WT_RET(__rec_col_fix_bulk_insert_split_check(cbulk));
+
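+ /* Set a single bit-field entry from the cursor's value. */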
+ __bit_setv(r->first_free,
+ cbulk->entry, btree->bitcnt, ((uint8_t *)cursor->value.data)[0]);
+ ++cbulk->entry;
+ ++r->recno;
+
+ return (0);
+}
+
+/*
+ * __wt_bulk_insert_var --
+ * Variable-length column-store bulk insert.
+ */
+int
+__wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+ WT_RECONCILE *r;
+
+ r = cbulk->reconcile;
+ btree = S2BT(session);
+
+ /*
+ * Store the bulk cursor's last buffer, not the current value: we're
+ * creating a duplicate count, which means we want the previous value
+ * seen, not the current value.
+ */
+ val = &r->v;
+ WT_RET(__rec_cell_build_val(
+ session, r, cbulk->last.data, cbulk->last.size, cbulk->rle));
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, cbulk->rle, val));
+ __rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += cbulk->rle;
+
+ return (0);
+}
+
+/*
+ * __rec_vtype --
+ * Return a value cell's address type.
+ */
+static inline u_int
+__rec_vtype(WT_ADDR *addr)
+{
+ if (addr->type == WT_ADDR_INT)
+ return (WT_CELL_ADDR_INT);
+ if (addr->type == WT_ADDR_LEAF)
+ return (WT_CELL_ADDR_LEAF);
+ return (WT_CELL_ADDR_LEAF_NO);
+}
+
+/*
+ * __rec_col_int --
+ * Reconcile a column-store internal page.
+ */
+static int
+__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_DECL_RET;
+ WT_KV *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ int hazard, state;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ val = &r->v;
+ vpack = &_vpack;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_intl_recno, btree->maxintlpage));
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /* Update the starting record number in case we split. */
+ r->recno = ref->key.recno;
+
+ /*
+ * Modified child.
+ * The page may be emptied or internally created during a split.
+ * Deleted/split pages are merged into the parent and discarded.
+ */
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = NULL;
+ child = ref->page;
+ if (state != 0) {
+ /*
+ * Currently the only non-zero returned state possible
+ * for a column-store page is child-modified (all other
+ * states are part of the fast-truncate support, which
+ * is row-store only).
+ */
+ WT_ASSERT(session, state == WT_CHILD_MODIFIED);
+
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Column-store pages are almost never empty, as
+ * discarding a page would remove a chunk of the
+ * name space. The exceptions are pages created
+ * when the tree is created, and never filled.
+ */
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ WT_ERR(__rec_col_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+ }
+
+ /*
+ * Build the value cell. The child page address is in one of 3
+ * places: if the page was replaced, the page's modify structure
+ * references it and we set addr just above in the switch
+ * statement. Else, the WT_REF->addr reference points to an
+ * on-page cell or an off-page WT_ADDR structure: if it's an
+ * on-page cell, we copy it from the page; else we build a new
+ * cell.
+ */
+ if (addr == NULL && __wt_off_page(page, ref->addr))
+ addr = ref->addr;
+ if (addr == NULL) {
+ __wt_cell_unpack(ref->addr, vpack);
+ val->buf.data = ref->addr;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+ } else
+ __rec_cell_build_addr(r, addr->addr, addr->size,
+ __rec_vtype(addr), ref->key.recno);
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_ERR(__rec_split_raw(session, r));
+ else
+ WT_ERR(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ __rec_copy_incr(session, r, val);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_col_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_KV *val;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Update the starting record number in case we split. */
+ r->recno = multi->key.recno;
+
+ /* Build the value cell. */
+ addr = &multi->addr;
+ __rec_cell_build_addr(r,
+ addr->addr, addr->size, __rec_vtype(addr), r->recno);
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ __rec_copy_incr(session, r, val);
+ }
+ return (0);
+}
+
+/*
+ * __rec_col_fix --
+ * Reconcile a fixed-width, column-store leaf page.
+ */
+static int
+__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_INSERT *ins;
+ WT_UPDATE *upd;
+ uint64_t recno;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+ /* Update any changes to the original on-page data items. */
+ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd != NULL)
+ __bit_setv_recno(page, WT_INSERT_RECNO(ins),
+ btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ }
+
+ /* Copy the updated, disk-image bytes into place. */
+ memcpy(r->first_free, page->pg_fix_bitf,
+ __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt));
+
+ /* Calculate the number of additional entries that fit on the page. */
+ entry = page->pg_fix_entries;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(
+ btree, r->space_avail) - page->pg_fix_entries;
+ r->recno += entry;
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ for (;;) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ for (recno = WT_INSERT_RECNO(ins);
+ nrecs > 0 && r->recno < recno;
+ --nrecs, ++entry, ++r->recno)
+ __bit_setv(
+ r->first_free, entry, btree->bitcnt, 0);
+
+ if (nrecs > 0) {
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ --nrecs;
+ ++entry;
+ ++r->recno;
+ break;
+ }
+
+ /*
+ * If the page is full, update the counters and
+ * split.
+ *
+ * Boundary: split or write the page.
+ */
+ __rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+ WT_RET(__rec_split(session, r));
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+ }
+ }
+
+ /* Update the counters. */
+ __rec_incr(
+ session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_fix_slvg --
+ * Reconcile a fixed-width, column-store leaf page created during salvage.
+ */
+static int
+__rec_col_fix_slvg(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ uint64_t page_start, page_take;
+ uint32_t entry, nrecs;
+
+ btree = S2BT(session);
+
+ /*
+ * !!!
+ * It's vanishingly unlikely and probably impossible for fixed-length
+ * column-store files to have overlapping key ranges. It's possible
+ * for an entire key range to go missing (if a page is corrupted and
+ * lost), but because pages can't split, it shouldn't be possible to
+ * find pages where the key ranges overlap. That said, we check for
+ * it during salvage and clean up after it here because it doesn't
+ * cost much and future column-store formats or operations might allow
+ * for fixed-length format ranges to overlap during salvage, and I
+ * don't want to have to retrofit the code later.
+ */
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_fix_recno, btree->maxleafpage));
+
+ /* We may not be taking all of the entries on the original page. */
+ page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take;
+ page_start = salvage->skip == 0 ? 0 : salvage->skip;
+
+ /* Calculate the number of entries per page. */
+ entry = 0;
+ nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail);
+
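+ /* Fill in any missing records with zero entries. */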
+ for (; nrecs > 0 && salvage->missing > 0;
+ --nrecs, --salvage->missing, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt, 0);
+
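+ /* Copy the records we're taking from the original page. */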
+ for (; nrecs > 0 && page_take > 0;
+ --nrecs, --page_take, ++page_start, ++entry)
+ __bit_setv(r->first_free, entry, btree->bitcnt,
+ __bit_getv(page->pg_fix_bitf,
+ (uint32_t)page_start, btree->bitcnt));
+
+ r->recno += entry;
+ __rec_incr(session, r, entry,
+ __bitstr_size((size_t)entry * btree->bitcnt));
+
+ /*
+ * We can't split during salvage -- if everything didn't fit, it's
+ * all gone wrong.
+ */
+ if (salvage->missing != 0 || page_take != 0)
+ WT_PANIC_RET(session, WT_PANIC,
+ "%s page too large, attempted split during salvage",
+ __wt_page_type_string(page->type));
+
+ /* Write the page. */
+ return (__rec_split_finish(session, r));
+}
+
+/*
+ * __rec_col_var_helper --
+ * Create a column-store variable length record cell and write it onto a
+ * page.
+ */
+static int
+__rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ WT_SALVAGE_COOKIE *salvage,
+ WT_ITEM *value, int deleted, uint8_t overflow_type, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * Occasionally, salvage needs to discard records from the beginning or
+ * end of the page, and because the items may be part of a RLE cell, do
+ * the adjustments here. It's not a mistake that we don't bother
+ * telling our caller when we've handled all the records it cares
+ * about and it could quit processing the page: salvage is a rare
+ * operation and we don't want to complicate our caller's loop.
+ */
+ if (salvage != NULL) {
+ if (salvage->done)
+ return (0);
+ if (salvage->skip != 0) {
+ if (rle <= salvage->skip) {
+ salvage->skip -= rle;
+ return (0);
+ }
+ rle -= salvage->skip;
+ salvage->skip = 0;
+ }
+ if (salvage->take != 0) {
+ if (rle <= salvage->take)
+ salvage->take -= rle;
+ else {
+ rle = salvage->take;
+ salvage->take = 0;
+ }
+ if (salvage->take == 0)
+ salvage->done = 1;
+ }
+ }
+
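+ /*
+ * Build the cell to be written: a deleted cell, an overflow
+ * value cell, or an ordinary value cell.
+ */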
+ if (deleted) {
+ val->cell_len = __wt_cell_pack_del(&val->cell, rle);
+ val->buf.data = NULL;
+ val->buf.size = 0;
+ val->len = val->cell_len;
+ } else if (overflow_type) {
+ val->cell_len = __wt_cell_pack_ovfl(
+ &val->cell, overflow_type, rle, value->size);
+ val->buf.data = value->data;
+ val->buf.size = value->size;
+ val->len = val->cell_len + value->size;
+ } else
+ WT_RET(__rec_cell_build_val(
+ session, r, value->data, value->size, rle));
+
+ /* Boundary: split or write the page. */
+ while (val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the value onto the page. */
+ if (!deleted && !overflow_type && btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, rle, val));
+ __rec_copy_incr(session, r, val);
+
+ /* Update the starting record number in case we split. */
+ r->recno += rle;
+
+ return (0);
+}
+
+/*
+ * __rec_col_var --
+ * Reconcile a variable-width column-store leaf page.
+ */
+static int
+__rec_col_var(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *vpack, _vpack;
+ WT_COL *cip;
+ WT_DECL_ITEM(orig);
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_ITEM *last;
+ WT_UPDATE *upd;
+ uint64_t n, nrepeat, repeat_count, rle, src_recno;
+ uint32_t i, size;
+ int deleted, last_deleted, orig_deleted, update_no_copy;
+ const void *data;
+
+ btree = S2BT(session);
+ last = r->last;
+ vpack = &_vpack;
+
+ WT_RET(__wt_scr_alloc(session, 0, &orig));
+ data = NULL;
+ size = 0;
+ upd = NULL;
+
+ WT_RET(__rec_split_init(
+ session, r, page, page->pg_var_recno, btree->maxleafpage));
+
+ /*
+ * The salvage code may be calling us to reconcile a page where there
+ * were missing records in the column-store name space. If taking the
+ * first record from the page, it might be a deleted record, so we
+ * have to give the RLE code a chance to figure that out. Else, if
+ * not taking the first record from the page, write a single element
+ * representing the missing records onto a new page. (Don't pass the
+ * salvage cookie to our helper function in this case, we're handling
+ * one of the salvage cookie fields on our own, and we don't need the
+ * helper function's assistance.)
+ */
+ rle = 0;
+ last_deleted = 0;
+ if (salvage != NULL && salvage->missing != 0) {
+ if (salvage->skip == 0) {
+ rle = salvage->missing;
+ last_deleted = 1;
+
+ /*
+ * Correct the number of records we're going to "take",
+ * pretending the missing records were on the page.
+ */
+ salvage->take += salvage->missing;
+ } else
+ WT_ERR(__rec_col_var_helper(
+ session, r, NULL, NULL, 1, 0, salvage->missing));
+ }
+
+ /*
+ * We track two data items through this loop: the previous (last) item
+ * and the current item: if the last item is the same as the current
+ * item, we increment the RLE count for the last item; if the last item
+ * is different from the current item, we write the last item onto the
+ * page, and replace it with the current item. The r->recno counter
+ * tracks records written to the page, and is incremented by the helper
+ * function immediately after writing records to the page. The record
+ * number of our source record, that is, the current item, is maintained
+ * in src_recno.
+ */
+ src_recno = r->recno + rle;
+
+ /* For each entry in the in-memory page... */
+ WT_COL_FOREACH(page, cip, i) {
+ ovfl_state = OVFL_IGNORE;
+ if ((cell = WT_COL_PTR(page, cip)) == NULL) {
+ nrepeat = 1;
+ ins = NULL;
+ orig_deleted = 1;
+ } else {
+ __wt_cell_unpack(cell, vpack);
+ nrepeat = __wt_cell_rle(vpack);
+ ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
+
+ /*
+ * If the original value is "deleted", there's no value
+ * to compare, we're done.
+ */
+ orig_deleted = vpack->type == WT_CELL_DEL ? 1 : 0;
+ if (orig_deleted)
+ goto record_loop;
+
+ /*
+ * Overflow items are tricky: we don't know until we're
+ * finished processing the set of values if we need the
+ * overflow value or not. If we don't use the overflow
+ * item at all, we have to discard it from the backing
+ * file, otherwise we'll leak blocks on the checkpoint.
+ * That's safe because if the backing overflow value is
+ * still needed by any running transaction, we'll cache
+ * a copy in the reconciliation tracking structures.
+ *
+ * Regardless, we avoid copying in overflow records: if
+ * there's a WT_INSERT entry that modifies a reference
+ * counted overflow record, we may have to write copies
+ * of the overflow record, and in that case we'll do the
+ * comparisons, but we don't read overflow items just to
+ * see if they match records on either side.
+ */
+ if (vpack->ovfl) {
+ ovfl_state = OVFL_UNUSED;
+ goto record_loop;
+ }
+
+ /*
+ * If data is Huffman encoded, we have to decode it in
+ * order to compare it with the last item we saw, which
+ * may have been an update string. This guarantees we
+ * find every single pair of objects we can RLE encode,
+ * including applications updating an existing record
+ * where the new value happens to match a Huffman-
+ * encoded value in a previous or next record.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_COL_VAR, vpack, orig));
+ }
+
+record_loop: /*
+ * Generate on-page entries: loop repeat records, looking for
+ * WT_INSERT entries matching the record number. The WT_INSERT
+ * lists are in sorted order, so we only need to check the next one.
+ */
+ for (n = 0;
+ n < nrepeat; n += repeat_count, src_recno += repeat_count) {
+ upd = NULL;
+ if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
+ WT_ERR(__rec_txn_read(
+ session, r, ins, NULL, vpack, &upd));
+ ins = WT_SKIP_NEXT(ins);
+ }
+ if (upd != NULL) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ update_no_copy = 1; /* No data copy */
+ repeat_count = 1; /* Single record */
+
+ deleted = 0;
+
+ /*
+ * If doing update save and restore, there's an
+ * update that's not globally visible, and the
+ * underlying value is a removed overflow value,
+ * we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring the on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case: if we remove an underlying
+ * overflow object, checkpoint reconciliation
+ * should never see it again, there should be a
+ * visible update in the way.
+ *
+ * Write a placeholder.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ data = "@";
+ size = 1;
+ } else {
+ update_no_copy = 0; /* Maybe data copy */
+
+ /*
+ * The repeat count is the number of records up
+ * to the next WT_INSERT record, or up to the
+ * end of the entry if we have no more WT_INSERT
+ * records.
+ */
+ if (ins == NULL)
+ repeat_count = nrepeat - n;
+ else
+ repeat_count =
+ WT_INSERT_RECNO(ins) - src_recno;
+
+ deleted = orig_deleted;
+ if (deleted)
+ goto compare;
+
+ /*
+ * If we are handling overflow items, use the
+ * overflow item itself exactly once, after
+ * which we have to copy it into a buffer and
+ * from then on use a complete copy because we
+ * are re-creating a new overflow record each
+ * time.
+ */
+ switch (ovfl_state) {
+ case OVFL_UNUSED:
+ /*
+ * An as-yet-unused overflow item.
+ *
+ * We're going to copy the on-page cell,
+ * write out any record we're tracking.
+ */
+ if (rle != 0) {
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last,
+ last_deleted, 0, rle));
+ rle = 0;
+ }
+
+ last->data = vpack->data;
+ last->size = vpack->size;
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, 0,
+ WT_CELL_VALUE_OVFL, repeat_count));
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+
+ ovfl_state = OVFL_USED;
+ continue;
+ case OVFL_USED:
+ /*
+ * Original is an overflow item; we wrote
+ * it once and now we need another copy;
+ * read it into memory.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(session,
+ WT_PAGE_COL_VAR, vpack, orig));
+
+ ovfl_state = OVFL_IGNORE;
+ /* FALLTHROUGH */
+ case OVFL_IGNORE:
+ /*
+ * Original is an overflow item and we
+ * were forced to copy it into memory,
+ * or the original wasn't an overflow
+ * item; use the data copied into orig.
+ */
+ data = orig->data;
+ size = (uint32_t)orig->size;
+ break;
+ }
+ }
+
+compare: /*
+ * If we have a record against which to compare, and
+ * the records compare equal, increment the rle counter
+ * and continue. If the records don't compare equal,
+ * output the last record and swap the last and current
+ * buffers: do NOT update the starting record number,
+ * we've been doing that all along.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ rle += repeat_count;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ /*
+ * We can't simply assign the data values into
+ * the last buffer because they may have come
+ * from a copy built from an encoded/overflow
+ * cell and creating the next record is going
+ * to overwrite that memory. Check, because
+ * encoded/overflow cells aren't that common
+ * and we'd like to avoid the copy. If data
+ * was taken from the current unpack structure
+ * (which points into the page), or was taken
+ * from an update structure, we can just use
+ * the pointers, they're not moving.
+ */
+ if (data == vpack->data || update_no_copy) {
+ last->data = data;
+ last->size = size;
+ } else
+ WT_ERR(__wt_buf_set(
+ session, last, data, size));
+ }
+ last_deleted = deleted;
+ rle = repeat_count;
+ }
+
+ /*
+ * If we had a reference to an overflow record we never used,
+ * discard the underlying blocks, they're no longer useful.
+ *
+ * One complication: we must cache a copy before discarding the
+ * on-disk version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (ovfl_state == OVFL_UNUSED &&
+ vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(__wt_ovfl_cache(session, page, upd, vpack));
+ }
+
+ /* Walk any append list. */
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ WT_ERR(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL)
+ continue;
+ for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
+ /*
+ * The application may have inserted records which left
+ * gaps in the name space.
+ */
+ if (src_recno < n)
+ deleted = 1;
+ else {
+ deleted = WT_UPDATE_DELETED_ISSET(upd);
+ if (!deleted) {
+ data = WT_UPDATE_DATA(upd);
+ size = upd->size;
+ }
+ }
+
+ /*
+ * Handle RLE accounting and comparisons -- see comment
+ * above, this code fragment does the same thing.
+ */
+ if (rle != 0) {
+ if ((deleted && last_deleted) ||
+ (!last_deleted && !deleted &&
+ last->size == size &&
+ memcmp(last->data, data, size) == 0)) {
+ ++rle;
+ continue;
+ }
+ WT_ERR(__rec_col_var_helper(session, r,
+ salvage, last, last_deleted, 0, rle));
+ }
+
+ /*
+ * Swap the current/last state. We always assign the
+ * data values to the buffer because they can only be
+ * the data from a WT_UPDATE structure.
+ *
+ * Reset RLE counter and turn on comparisons.
+ */
+ if (!deleted) {
+ last->data = data;
+ last->size = size;
+ }
+ last_deleted = deleted;
+ rle = 1;
+ }
+ }
+
+ /* If we were tracking a record, write it. */
+ if (rle != 0)
+ WT_ERR(__rec_col_var_helper(
+ session, r, salvage, last, last_deleted, 0, rle));
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&orig);
+ return (ret);
+}
+
+/*
+ * __rec_row_int --
+ * Reconcile a row-store internal page.
+ */
+static int
+__rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_KV *key, *val;
+ WT_PAGE *child;
+ WT_REF *ref;
+ size_t size;
+ u_int vtype;
+ int hazard, key_onpage_ovfl, ovfl_key, state;
+ const void *p;
+
+ btree = S2BT(session);
+ child = NULL;
+ hazard = 0;
+
+ key = &r->k;
+ kpack = &_kpack;
+ WT_CLEAR(*kpack); /* -Wuninitialized */
+ val = &r->v;
+ vpack = &_vpack;
+ WT_CLEAR(*vpack); /* -Wuninitialized */
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxintlpage));
+
+ /*
+ * Ideally, we'd never store the 0th key on row-store internal pages
+ * because it's never used during tree search and there's no reason
+ * to waste the space. The problem is how we do splits: when we split,
+ * we've potentially picked out several "split points" in the buffer
+ * which is overflowing the maximum page size, and when the overflow
+ * happens, we go back and physically split the buffer, at those split
+ * points, into new pages. It would be both difficult and expensive
+ * to re-process the 0th key at each split point to be an empty key,
+ * so we don't do that. However, we are reconciling an internal page
+ * for whatever reason, and the 0th key is known to be useless. We
+ * truncate the key to a single byte instead of removing it entirely;
+ * it simplifies various things in other parts of the code (we don't
+ * have to special case transforming the page from its disk image to
+ * its in-memory version, for example).
+ */
+ r->cell_zero = 1;
+
+ /* For each entry in the in-memory page... */
+ WT_INTL_FOREACH_BEGIN(session, page, ref) {
+ /*
+ * There are different paths if the key is an overflow item vs.
+ * a straight-forward on-page value. If an overflow item, we
+ * would have instantiated it, and we can use that fact to set
+ * things up.
+ *
+ * Note the cell reference and unpacked key cell are available
+ * only in the case of an instantiated, off-page key.
+ */
+ ikey = __wt_ref_key_instantiated(ref);
+ if (ikey == NULL || ikey->cell_offset == 0) {
+ cell = NULL;
+ key_onpage_ovfl = 0;
+ } else {
+ cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ __wt_cell_unpack(cell, kpack);
+ key_onpage_ovfl =
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ }
+
+ WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state));
+ addr = ref->addr;
+ child = ref->page;
+ vtype = 0;
+
+ /* Deleted child we don't have to write. */
+ if (state == WT_CHILD_IGNORE) {
+ /*
+ * Overflow keys referencing discarded pages are no
+ * longer useful, schedule them for discard. Don't
+ * worry about instantiation, internal page keys are
+ * always instantiated. Don't worry about reuse,
+ * reusing this key in this reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ }
+
+ /* Deleted child requiring a proxy cell. */
+ if (state == WT_CHILD_PROXY)
+ vtype = WT_CELL_ADDR_DEL;
+
+ /*
+ * Modified child. Empty pages are merged into the parent and
+ * discarded.
+ */
+ if (state == WT_CHILD_MODIFIED)
+ switch (F_ISSET(child->modify, WT_PM_REC_MASK)) {
+ case WT_PM_REC_EMPTY:
+ /*
+ * Overflow keys referencing empty pages are no
+ * longer useful, schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_MULTIBLOCK:
+ /*
+ * Overflow keys referencing split pages are no
+ * longer useful (the split page's key is the
+ * interesting key); schedule them for discard.
+ * Don't worry about instantiation, internal
+ * page keys are always instantiated. Don't
+ * worry about reuse, reusing this key in this
+ * reconciliation is unlikely.
+ */
+ if (key_onpage_ovfl)
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+
+ WT_ERR(__rec_row_merge(session, r, child));
+ CHILD_RELEASE_ERR(session, hazard, ref);
+ continue;
+ case WT_PM_REC_REPLACE:
+ /*
+ * If the page is replaced, the page's modify
+ * structure has the page's address.
+ */
+ addr = &child->modify->mod_replace;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Build the value cell, the child page's address. Addr points
+ * to an on-page cell or an off-page WT_ADDR structure. The
+ * cell type has been set in the case of page deletion requiring
+ * a proxy cell, otherwise use the information from the addr or
+ * original cell.
+ */
+ if (__wt_off_page(page, addr)) {
+ p = addr->addr;
+ size = addr->size;
+ if (vtype == 0)
+ vtype = __rec_vtype(addr);
+ } else {
+ __wt_cell_unpack(ref->addr, vpack);
+ p = vpack->data;
+ size = vpack->size;
+ if (vtype == 0)
+ vtype = vpack->raw;
+ }
+ __rec_cell_build_addr(r, p, size, vtype, 0);
+ CHILD_RELEASE_ERR(session, hazard, ref);
+
+ /*
+ * Build key cell.
+ * Truncate any 0th key, internal pages don't need 0th keys.
+ */
+ if (key_onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+ } else {
+ __wt_ref_key(page, ref, &p, &size);
+ WT_ERR(__rec_cell_build_int_key(
+ session, r, p, r->cell_zero ? 1 : size, &ovfl_key));
+ }
+ r->cell_zero = 0;
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (key_onpage_ovfl) {
+ WT_ERR(__wt_buf_set(session,
+ r->cur, WT_IKEY_DATA(ikey), ikey->size));
+ key_onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+ }
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ } WT_INTL_FOREACH_END;
+
+ /* Write the remnant page. */
+ return (__rec_split_finish(session, r));
+
+err: CHILD_RELEASE(session, hazard, ref);
+ return (ret);
+}
+
+/*
+ * __rec_row_merge --
+ * Merge in a split page.
+ */
+static int
+__rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_ADDR *addr;
+ WT_KV *key, *val;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+ int ovfl_key;
+
+ mod = page->modify;
+
+ key = &r->k;
+ val = &r->v;
+
+ /* For each entry in the split array... */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ /* Build the key and value cells. */
+ WT_RET(__rec_cell_build_int_key(session, r,
+ WT_IKEY_DATA(multi->key.ikey),
+ r->cell_zero ? 1 : multi->key.ikey->size, &ovfl_key));
+ r->cell_zero = 0;
+
+ addr = &multi->addr;
+ __rec_cell_build_addr(
+ r, addr->addr, addr->size, __rec_vtype(addr), 0);
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail)
+ if (r->raw_compression)
+ WT_RET(__rec_split_raw(session, r));
+ else
+ WT_RET(__rec_split(session, r));
+
+ /* Copy the key and value onto the page. */
+ __rec_copy_incr(session, r, key);
+ __rec_copy_incr(session, r, val);
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+ return (0);
+}
+
+/*
+ * __rec_row_leaf --
+ * Reconcile a row-store leaf page.
+ */
+static int
+__rec_row_leaf(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+{
+ WT_BTREE *btree;
+ WT_CELL *cell, *val_cell;
+ WT_CELL_UNPACK *kpack, _kpack, *vpack, _vpack;
+ WT_DECL_ITEM(tmpkey);
+ WT_DECL_ITEM(tmpval);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_INSERT *ins;
+ WT_KV *key, *val;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ size_t size;
+ uint64_t slvg_skip;
+ uint32_t i;
+ int dictionary, onpage_ovfl, ovfl_key;
+ const void *p;
+ void *copy;
+
+ btree = S2BT(session);
+ slvg_skip = salvage == NULL ? 0 : salvage->skip;
+
+ key = &r->k;
+ val = &r->v;
+
+ WT_RET(__rec_split_init(session, r, page, 0ULL, btree->maxleafpage));
+
+ /*
+ * Write any K/V pairs inserted into the page before the first from-disk
+ * key on the page.
+ */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL)
+ WT_RET(__rec_row_leaf_insert(session, r, ins));
+
+ /*
+ * Temporary buffers in which to instantiate any uninstantiated keys
+ * or value items we need.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &tmpkey));
+ WT_RET(__wt_scr_alloc(session, 0, &tmpval));
+
+ /* For each entry in the page... */
+ WT_ROW_FOREACH(page, rip, i) {
+ /*
+ * The salvage code, on some rare occasions, wants to reconcile
+ * a page but skip some leading records on the page. Because
+ * the row-store leaf reconciliation function copies keys from
+ * the original disk page, this is non-trivial -- just changing
+ * the in-memory pointers isn't sufficient, we have to change
+ * the WT_CELL structures on the disk page, too. It's ugly, but
+ * we pass in a value that tells us how many records to skip in
+ * this case.
+ */
+ if (slvg_skip != 0) {
+ --slvg_skip;
+ continue;
+ }
+
+ /*
+ * Figure out the key: set any cell reference (and unpack it),
+ * set any instantiated key reference.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (cell == NULL)
+ kpack = NULL;
+ else {
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ }
+
+ /* Unpack the on-page value cell, and look for an update. */
+ if ((val_cell =
+ __wt_row_leaf_value_cell(page, rip, NULL)) == NULL)
+ vpack = NULL;
+ else {
+ vpack = &_vpack;
+ __wt_cell_unpack(val_cell, vpack);
+ }
+ WT_ERR(__rec_txn_read(session, r, NULL, rip, vpack, &upd));
+
+ /* Build value cell. */
+ dictionary = 0;
+ if (upd == NULL) {
+ /*
+ * When the page was read into memory, there may not
+ * have been a value item.
+ *
+ * If there was a value item, check if it's a dictionary
+ * cell (a copy of another item on the page). If it's a
+ * copy, we have to create a new value item as the old
+ * item might have been discarded from the page.
+ */
+ if (vpack == NULL) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else if (vpack->raw == WT_CELL_VALUE_COPY) {
+ /* If the item is Huffman encoded, decode it. */
+ if (btree->huffman_value == NULL) {
+ p = vpack->data;
+ size = vpack->size;
+ } else {
+ WT_ERR(__wt_huffman_decode(session,
+ btree->huffman_value,
+ vpack->data, vpack->size,
+ tmpval));
+ p = tmpval->data;
+ size = tmpval->size;
+ }
+ WT_ERR(__rec_cell_build_val(
+ session, r, p, size, (uint64_t)0));
+ dictionary = 1;
+ } else if (vpack->raw == WT_CELL_VALUE_OVFL_RM) {
+ /*
+ * If doing update save and restore in service
+ * of eviction, there's an update that's not
+ * globally visible, and the underlying value
+ * is a removed overflow value, we end up here.
+ *
+ * When the update save/restore code noticed the
+ * removed overflow value, it appended a copy of
+ * the cached, original overflow value to the
+ * update list being saved (ensuring any on-page
+ * item will never be accessed after the page is
+ * re-instantiated), then returned a NULL update
+ * to us.
+ *
+ * Assert the case.
+ */
+ WT_ASSERT(session,
+ F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+
+ /*
+ * If the key is also a removed overflow item,
+ * don't write anything at all.
+ *
+ * We don't have to write anything because the
+ * code re-instantiating the page gets the key
+ * to match the saved list of updates from the
+ * original page. By not putting the key on
+ * the page, we'll move the key/value set from
+ * a row-store leaf page slot to an insert list,
+ * but that shouldn't matter.
+ *
+ * The reason we bother with the test is that
+ * overflows are expensive to write. It's hard
+ * to imagine a real workload where this test is
+ * worth the effort, but it's a simple test.
+ */
+ if (kpack != NULL &&
+ kpack->raw == WT_CELL_KEY_OVFL_RM)
+ goto leaf_insert;
+
+ /*
+ * The on-page value will never be accessed,
+ * write a placeholder record.
+ */
+ WT_ERR(__rec_cell_build_val(
+ session, r, "@", 1, (uint64_t)0));
+ } else {
+ val->buf.data = val_cell;
+ val->buf.size = __wt_cell_total_len(vpack);
+ val->cell_len = 0;
+ val->len = val->buf.size;
+
+ /* Track if page has overflow items. */
+ if (vpack->ovfl)
+ r->ovfl_items = 1;
+ }
+ } else {
+ /*
+ * If the original value was an overflow and we've not
+ * already done so, discard it. One complication: we
+ * must cache a copy before discarding the on-disk
+ * version if there's a transaction in the system that
+ * might read the original value.
+ */
+ if (vpack != NULL &&
+ vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
+ WT_ERR(
+ __wt_ovfl_cache(session, page, rip, vpack));
+
+ /* If this key/value pair was deleted, we're done. */
+ if (WT_UPDATE_DELETED_ISSET(upd)) {
+ /*
+ * Overflow keys referencing discarded values
+ * are no longer useful, discard the backing
+ * blocks. Don't worry about reuse, reusing
+ * keys from a row-store page reconciliation
+ * seems unlikely enough to ignore.
+ */
+ if (kpack != NULL && kpack->ovfl &&
+ kpack->raw != WT_CELL_KEY_OVFL_RM) {
+ /*
+ * Keys are part of the name-space, we
+ * can't remove them from the in-memory
+ * tree; if an overflow key was deleted
+ * without being instantiated (for
+ * example, cursor-based truncation), do
+ * it now.
+ */
+ if (ikey == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session,
+ page, rip, tmpkey, 1));
+
+ WT_ERR(__wt_ovfl_discard_add(
+ session, page, kpack->cell));
+ }
+
+ /*
+ * We aren't actually creating the key so we
+ * can't use bytes from this key to provide
+ * prefix information for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Proceed with appended key/value pairs. */
+ goto leaf_insert;
+ }
+
+ /*
+ * If no value, nothing needs to be copied. Otherwise,
+ * build the value's WT_CELL chunk from the most recent
+ * update value.
+ */
+ if (upd->size == 0) {
+ val->buf.data = NULL;
+ val->cell_len = val->len = val->buf.size = 0;
+ } else {
+ WT_ERR(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size,
+ (uint64_t)0));
+ dictionary = 1;
+ }
+ }
+
+ /*
+ * Build key cell.
+ *
+ * If the key is an overflow key that hasn't been removed, use
+ * the original backing blocks.
+ */
+ onpage_ovfl = kpack != NULL &&
+ kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM;
+ if (onpage_ovfl) {
+ key->buf.data = cell;
+ key->buf.size = __wt_cell_total_len(kpack);
+ key->cell_len = 0;
+ key->len = key->buf.size;
+ ovfl_key = 1;
+
+ /*
+ * We aren't creating a key so we can't use this key as
+ * a prefix for a subsequent key.
+ */
+ tmpkey->size = 0;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+ } else {
+ /*
+ * Get the key from the page or an instantiated key, or
+ * inline building the key from a previous key (it's a
+ * fast path for simple, prefix-compressed keys), or by
+ * building the key from scratch.
+ */
+ if (__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &tmpkey->data, &tmpkey->size))
+ goto build;
+
+ kpack = &_kpack;
+ __wt_cell_unpack(cell, kpack);
+ if (btree->huffman_key == NULL &&
+ kpack->type == WT_CELL_KEY &&
+ tmpkey->size >= kpack->prefix) {
+ /*
+ * The previous clause checked for a prefix of
+ * zero, which means the temporary buffer must
+ * have a non-zero size, and it references a
+ * valid key.
+ */
+ WT_ASSERT(session, tmpkey->size != 0);
+
+ /*
+ * Grow the buffer as necessary, ensuring the
+ * data has been copied into local buffer space,
+ * then append the suffix to the prefix already
+ * in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy
+ * data we don't need, truncate the item's data
+ * length to the prefix bytes.
+ */
+ tmpkey->size = kpack->prefix;
+ WT_ERR(__wt_buf_grow(session,
+ tmpkey, tmpkey->size + kpack->size));
+ memcpy((uint8_t *)tmpkey->mem + tmpkey->size,
+ kpack->data, kpack->size);
+ tmpkey->size += kpack->size;
+ } else
+ WT_ERR(__wt_row_leaf_key_copy(
+ session, page, rip, tmpkey));
+build:
+ WT_ERR(__rec_cell_build_leaf_key(session, r,
+ tmpkey->data, tmpkey->size, &ovfl_key));
+ }
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_ERR(__rec_split_raw(session, r));
+ continue;
+ }
+
+ /*
+ * In one path above, we copied address blocks from the
+ * page rather than building the actual key. In that
+ * case, we have to build the actual key now because we
+ * are about to promote it.
+ */
+ if (onpage_ovfl) {
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, kpack, r->cur));
+ onpage_ovfl = 0;
+ }
+ WT_ERR(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key is
+ * written to the new page and, unless we're already
+ * working with an overflow key, rebuild the key
+ * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_ERR(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (dictionary && btree->dictionary)
+ WT_ERR(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+
+leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
+ if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
+ WT_ERR(__rec_row_leaf_insert(session, r, ins));
+ }
+
+ /* Write the remnant page. */
+ ret = __rec_split_finish(session, r);
+
+err: __wt_scr_free(&tmpkey);
+ __wt_scr_free(&tmpval);
+ return (ret);
+}
+
+/*
+ * __rec_row_leaf_insert --
+ * Walk an insert chain, writing K/V pairs.
+ */
+static int
+__rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
+{
+ WT_BTREE *btree;
+ WT_KV *key, *val;
+ WT_UPDATE *upd;
+ int ovfl_key;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+ val = &r->v;
+
+ for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
+ /* Look for an update. */
+ WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
+ if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd))
+ continue;
+
+ if (upd->size == 0) /* Build value cell. */
+ val->len = 0;
+ else
+ WT_RET(__rec_cell_build_val(session, r,
+ WT_UPDATE_DATA(upd), upd->size, (uint64_t)0));
+
+ /* Build key cell. */
+ WT_RET(__rec_cell_build_leaf_key(session, r,
+ WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key));
+
+ /* Boundary: split or write the page. */
+ while (key->len + val->len > r->space_avail) {
+ if (r->raw_compression) {
+ WT_RET(__rec_split_raw(session, r));
+ continue;
+ }
+ WT_RET(__rec_split(session, r));
+
+ /*
+ * Turn off prefix compression until a full key is
+ * written to the new page and, unless we're already
+ * working with an overflow key, rebuild the key
+ * without prefix compression.
+ */
+ if (r->key_pfx_compress_conf) {
+ r->key_pfx_compress = 0;
+ if (!ovfl_key)
+ WT_RET(__rec_cell_build_leaf_key(
+ session, r, NULL, 0, &ovfl_key));
+ }
+ }
+
+ /* Copy the key/value pair onto the page. */
+ __rec_copy_incr(session, r, key);
+ if (val->len == 0)
+ r->any_empty_value = 1;
+ else {
+ r->all_empty_value = 0;
+ if (btree->dictionary)
+ WT_RET(__rec_dict_replace(session, r, 0, val));
+ __rec_copy_incr(session, r, val);
+ }
+
+ /* Update compression state. */
+ __rec_key_state_update(r, ovfl_key);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_split_discard --
+ * Discard the pages resulting from a previous split.
+ */
+static int
+__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ WT_PAGE_MODIFY *mod;
+ WT_MULTI *multi;
+ uint32_t i;
+
+ bm = S2BT(session)->bm;
+ mod = page->modify;
+
+ /*
+ * A page that split is being reconciled for the second, or subsequent
+ * time; discard underlying block space used in the last reconciliation
+ * that is not being reused for this reconciliation.
+ */
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ __wt_free(session, multi->key.ikey);
+ break;
+ }
+ if (multi->skip == NULL) {
+ if (multi->addr.reuse)
+ multi->addr.addr = NULL;
+ else {
+ WT_RET(bm->free(bm, session,
+ multi->addr.addr, multi->addr.size));
+ __wt_free(session, multi->addr.addr);
+ }
+ } else {
+ __wt_free(session, multi->skip);
+ __wt_free(session, multi->skip_dsk);
+ }
+ }
+ __wt_free(session, mod->mod_multi);
+ mod->mod_multi_entries = 0;
+
+ /*
+ * This routine would be trivial, and only walk a single page freeing
+ * any blocks written to support the split, except for root splits.
+ * In the case of root splits, we have to cope with multiple pages in
+ * a linked list, and we also have to discard overflow items written
+ * for the page.
+ */
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ if (mod->mod_root_split == NULL)
+ break;
+ WT_RET(__rec_split_discard(session, mod->mod_root_split));
+ WT_RET(__wt_ovfl_track_wrapup(session, mod->mod_root_split));
+ __wt_page_out(session, &mod->mod_root_split);
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * __rec_write_wrapup --
+ * Finish the reconciliation.
+ */
+static int
+__rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *bnd;
+ WT_BTREE *btree;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *ref;
+ size_t addr_size;
+ const uint8_t *addr;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ mod = page->modify;
+ ref = r->ref;
+
+ /*
+ * This page may have previously been reconciled, and that information
+ * is now about to be replaced. Make sure it's discarded at some point,
+ * and clear the underlying modification information, we're creating a
+ * new reality.
+ */
+ switch (F_ISSET(mod, WT_PM_REC_MASK)) {
+ case 0: /*
+ * The page has never been reconciled before, free the original
+ * address blocks (if any). The "if any" is for empty trees
+ * created when a new tree is opened or previously deleted pages
+ * instantiated in memory.
+ *
+ * The exception is root pages: they are never tracked or free'd,
+ * they are checkpoints, and must be explicitly dropped.
+ */
+ if (__wt_ref_is_root(ref))
+ break;
+ if (ref->addr != NULL) {
+ /*
+ * Free the page and clear the address (so we don't free
+ * it twice).
+ */
+ WT_RET(__wt_ref_info(
+ session, ref, &addr, &addr_size, NULL));
+ WT_RET(bm->free(bm, session, addr, addr_size));
+ if (__wt_off_page(ref->home, ref->addr)) {
+ __wt_free(
+ session, ((WT_ADDR *)ref->addr)->addr);
+ __wt_free(session, ref->addr);
+ }
+ ref->addr = NULL;
+ }
+ break;
+ case WT_PM_REC_EMPTY: /* Page deleted */
+ break;
+ case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
+ /*
+ * Discard the multiple replacement blocks.
+ */
+ WT_RET(__rec_split_discard(session, page));
+ break;
+ case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
+ /*
+ * Discard the replacement leaf page's blocks.
+ *
+ * The exception is root pages: they are never tracked or free'd,
+ * they are checkpoints, and must be explicitly dropped.
+ */
+ if (!__wt_ref_is_root(ref))
+ WT_RET(bm->free(bm, session,
+ mod->mod_replace.addr, mod->mod_replace.size));
+
+ /* Discard the replacement page's address. */
+ __wt_free(session, mod->mod_replace.addr);
+ mod->mod_replace.size = 0;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_CLR(mod, WT_PM_REC_MASK);
+
+ /*
+ * Wrap up overflow tracking. If we are about to create a checkpoint,
+ * the system must be entirely consistent at that point (the underlying
+ * block manager is presumably going to do some action to resolve the
+ * list of allocated/free/whatever blocks that are associated with the
+ * checkpoint).
+ */
+ WT_RET(__wt_ovfl_track_wrapup(session, page));
+
+ switch (r->bnd_next) {
+ case 0: /* Page delete */
+ WT_RET(__wt_verbose(
+ session, WT_VERB_RECONCILE, "page %p empty", page));
+ WT_STAT_FAST_DATA_INCR(session, rec_page_delete);
+
+ /* If this is the root page, we need to create a sync point. */
+ ref = r->ref;
+ if (__wt_ref_is_root(ref))
+ WT_RET(
+ bm->checkpoint(bm, session, NULL, btree->ckpt, 0));
+
+ /*
+ * If the page was empty, we want to discard it from the tree
+ * by discarding the parent's key when evicting the parent.
+ * Mark the page as deleted, then return success, leaving the
+ * page in memory. If the page is subsequently modified, that
+ * is OK, we'll just reconcile it again.
+ */
+ F_SET(mod, WT_PM_REC_EMPTY);
+ break;
+ case 1: /* 1-for-1 page swap */
+ /*
+ * Because WiredTiger's pages grow without splitting, we're
+ * replacing a single page with another single page most of
+ * the time.
+ */
+ bnd = &r->bnd[0];
+
+ /*
+ * If we're saving/restoring changes for this page, there's
+ * nothing to write. Allocate, then initialize the array of
+ * replacement blocks.
+ */
+ if (bnd->skip != NULL) {
+ WT_RET(__wt_calloc_def(
+ session, r->bnd_next, &mod->mod_multi));
+ multi = mod->mod_multi;
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ mod->mod_multi_entries = 1;
+
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If this is a root page, then we don't have an address and we
+ * have to create a sync point. The address was cleared when
+ * we were about to write the buffer so we know what to do here.
+ */
+ if (bnd->addr.addr == NULL)
+ WT_RET(__wt_bt_write(session,
+ &r->dsk, NULL, NULL, 1, bnd->already_compressed));
+ else {
+ mod->mod_replace = bnd->addr;
+ bnd->addr.addr = NULL;
+ }
+
+ F_SET(mod, WT_PM_REC_REPLACE);
+ break;
+ default: /* Page split */
+ WT_RET(__wt_verbose(session, WT_VERB_RECONCILE,
+ "page %p reconciled into %" PRIu32 " pages",
+ page, r->bnd_next));
+
+ switch (page->type) {
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ WT_STAT_FAST_DATA_INCR(
+ session, rec_multiblock_internal);
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ case WT_PAGE_ROW_LEAF:
+ WT_STAT_FAST_DATA_INCR(session, rec_multiblock_leaf);
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* Display the actual split keys. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT)) {
+ WT_DECL_ITEM(tkey);
+ WT_DECL_RET;
+ uint32_t i;
+
+ if (page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_ROW_LEAF)
+ WT_RET(__wt_scr_alloc(session, 0, &tkey));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__wt_buf_set_printable(
+ session, tkey,
+ bnd->key.data, bnd->key.size));
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting key "
+ "%.*s",
+ (int)tkey->size,
+ (const char *)tkey->data));
+ break;
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_SPLIT,
+ "split: starting recno %" PRIu64,
+ bnd->recno));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+err: __wt_scr_free(&tkey);
+ WT_RET(ret);
+ }
+ if (r->bnd_next > r->bnd_next_max) {
+ r->bnd_next_max = r->bnd_next;
+ WT_STAT_FAST_DATA_SET(
+ session, rec_multiblock_max, r->bnd_next_max);
+ }
+
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_RET(__rec_split_row(session, r, page));
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_RET(__rec_split_col(session, r, page));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ F_SET(mod, WT_PM_REC_MULTIBLOCK);
+ break;
+ }
+
+ /*
+ * If updates were skipped, the tree isn't clean. The checkpoint call
+ * cleared the tree's modified value before calling the eviction thread,
+ * so we must explicitly reset the tree's modified flag. We insert a
+ * barrier after the change for clarity (the requirement is the value
+ * be set before a subsequent checkpoint reads it, and because the
+ * current checkpoint is waiting on this reconciliation to complete,
+ * there's no risk of that happening).
+ *
+ * Otherwise, if no updates were skipped, we have a new maximum
+ * transaction written for the page (used to decide if a clean page can
+ * be evicted). The page only might be clean; if the write generation
+ * is unchanged since reconciliation started, clear it and update cache
+ * dirty statistics, if the write generation changed, then the page has
+ * been written since we started reconciliation, it cannot be
+ * discarded.
+ */
+ if (r->leave_dirty) {
+ mod->first_dirty_txn = r->skipped_txn;
+
+ btree->modified = 1;
+ WT_FULL_BARRIER();
+ } else {
+ mod->rec_max_txn = r->max_txn;
+
+ if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0))
+ __wt_cache_dirty_decr(session, page);
+ }
+
+ return (0);
+}
+
+/*
+ * __rec_write_wrapup_err --
+ * Finish the reconciliation on error.
+ */
+static int
+__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BM *bm;
+ WT_BOUNDARY *bnd;
+ WT_DECL_RET;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ bm = S2BT(session)->bm;
+ mod = page->modify;
+
+ /*
+ * Clear the address-reused flag from the multiblock reconciliation
+ * information (otherwise we might think the backing block is being
+ * reused on a subsequent reconciliation where we want to free it).
+ */
+ if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i)
+ multi->addr.reuse = 0;
+
+ /*
+ * On error, discard blocks we've written, they're unreferenced by the
+ * tree. This is not a question of correctness, we're avoiding block
+ * leaks.
+ *
+ * Don't discard backing blocks marked for reuse, they remain part of
+ * a previous reconciliation.
+ */
+ WT_TRET(__wt_ovfl_track_wrapup_err(session, page));
+ for (bnd = r->bnd, i = 0; i < r->bnd_next; ++bnd, ++i)
+ if (bnd->addr.addr != NULL) {
+ if (bnd->addr.reuse)
+ bnd->addr.addr = NULL;
+ else {
+ WT_TRET(bm->free(bm, session,
+ bnd->addr.addr, bnd->addr.size));
+ __wt_free(session, bnd->addr.addr);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __rec_split_row --
+ * Split a row-store page into a set of replacement blocks.
+ */
+static int
+__rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BOUNDARY *bnd;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ WT_REF *ref;
+ uint32_t i;
+ size_t size;
+ void *p;
+
+ mod = page->modify;
+
+ /* We never set the first page's key, grab it from the original page. */
+ ref = r->ref;
+ if (__wt_ref_is_root(ref))
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1));
+ else {
+ __wt_ref_key(ref->home, ref, &p, &size);
+ WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size));
+ }
+
+ /* Allocate, then initialize the array of replacement blocks. */
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
+
+ for (multi = mod->mod_multi,
+ bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
+ WT_RET(__wt_row_ikey(session, 0,
+ bnd->key.data, bnd->key.size, &multi->key.ikey));
+
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ bnd->addr.addr = NULL;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
+ }
+ mod->mod_multi_entries = r->bnd_next;
+
+ return (0);
+}
+
+/*
+ * __rec_split_col --
+ * Split a column-store page into a set of replacement blocks.
+ */
+static int
+__rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BOUNDARY *bnd;
+ WT_MULTI *multi;
+ WT_PAGE_MODIFY *mod;
+ uint32_t i;
+
+ mod = page->modify;
+
+ /* Allocate, then initialize the array of replacement blocks. */
+ WT_RET(__wt_calloc_def(session, r->bnd_next, &mod->mod_multi));
+
+ for (multi = mod->mod_multi,
+ bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
+ multi->key.recno = bnd->recno;
+
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ multi->addr.reuse = 0;
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ bnd->addr.addr = NULL;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
+ }
+ mod->mod_multi_entries = r->bnd_next;
+
+ return (0);
+}
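+
+/*
+ * Note: this routine mirrors __rec_split_row above; the only difference
+ * is that column-store replacement blocks are keyed by their starting
+ * record number rather than by an instantiated row-store key.
+ */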
+
+/*
+ * __rec_cell_build_int_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store internal page.
+ */
+static int
+__rec_cell_build_int_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_KV *key;
+
+ *is_ovflp = 0;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ /* Copy the bytes into the "current" and key buffers. */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+ WT_RET(__wt_buf_set(session, &key->buf, data, size));
+
+ /* Create an overflow object if the data won't fit. */
+ if (size > btree->maxintlitem) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_internal);
+
+ *is_ovflp = 1;
+ return (__rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+
+ key->cell_len = __wt_cell_pack_int_key(&key->cell, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_leaf_key --
+ * Process a key and return a WT_CELL structure and byte string to be
+ * stored on a row-store leaf page.
+ */
+static int
+__rec_cell_build_leaf_key(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, int *is_ovflp)
+{
+ WT_BTREE *btree;
+ WT_KV *key;
+ size_t pfx_max;
+ uint8_t pfx;
+ const uint8_t *a, *b;
+
+ *is_ovflp = 0;
+
+ btree = S2BT(session);
+
+ key = &r->k;
+
+ pfx = 0;
+ if (data == NULL)
+ /*
+ * When data is NULL, our caller has a prefix compressed key
+ * they can't use (probably because they just crossed a split
+ * point). Use the full key saved when last called, instead.
+ */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, r->cur->data, r->cur->size));
+ else {
+ /*
+ * Save a copy of the key for later reference: we use the full
+ * key for prefix-compression comparisons, and if we are, for
+ * any reason, unable to use the compressed key we generate.
+ */
+ WT_RET(__wt_buf_set(session, r->cur, data, size));
+
+ /*
+ * Do prefix compression on the key. We know by definition the
+ * previous key sorts before the current key, which means the
+ * keys must differ and we just need to compare up to the
+ * shorter of the two keys.
+ */
+ if (r->key_pfx_compress) {
+ /*
+ * The prefix count is stored in a single byte, so we
+ * can't compress out more than UINT8_MAX bytes; limit
+ * the comparison to that.
+ */
+ pfx_max = UINT8_MAX;
+ if (size < pfx_max)
+ pfx_max = size;
+ if (r->last->size < pfx_max)
+ pfx_max = r->last->size;
+ for (a = data, b = r->last->data; pfx < pfx_max; ++pfx)
+ if (*a++ != *b++)
+ break;
+
+ /*
+ * Prefix compression may cost us CPU and memory when
+ * the page is re-loaded, don't do it unless there's
+ * reasonable gain.
+ */
+ if (pfx < btree->prefix_compression_min)
+ pfx = 0;
+ else
+ WT_STAT_FAST_DATA_INCRV(
+ session, rec_prefix_compression, pfx);
+ }
+
+ /* Copy the non-prefix bytes into the key buffer. */
+ WT_RET(__wt_buf_set(
+ session, &key->buf, (uint8_t *)data + pfx, size - pfx));
+ }
+
+ /* Optionally compress the key using the Huffman engine. */
+ if (btree->huffman_key != NULL)
+ WT_RET(__wt_huffman_encode(session, btree->huffman_key,
+ key->buf.data, (uint32_t)key->buf.size, &key->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (key->buf.size > btree->maxleafitem) {
+ /*
+ * Overflow objects aren't prefix compressed -- rebuild any
+ * object that was prefix compressed.
+ */
+ if (pfx == 0) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_key_leaf);
+
+ *is_ovflp = 1;
+ return (__rec_cell_build_ovfl(
+ session, r, key, WT_CELL_KEY_OVFL, (uint64_t)0));
+ }
+ return (
+ __rec_cell_build_leaf_key(session, r, NULL, 0, is_ovflp));
+ }
+
+ key->cell_len = __wt_cell_pack_leaf_key(&key->cell, pfx, key->buf.size);
+ key->len = key->cell_len + key->buf.size;
+
+ return (0);
+}
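+
+/*
+ * Worked example: if the previous key was "application" and the current
+ * key is "applied", the comparison loop above stops after 5 matching
+ * bytes ("appli"), so the cell stores pfx = 5 and only the 2-byte
+ * suffix "ed".  If btree->prefix_compression_min were larger than 5,
+ * pfx would be reset to 0 and the full key stored instead.
+ */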
+
+/*
+ * __rec_cell_build_addr --
+ * Process an address reference and return a cell structure to be stored
+ * on the page.
+ */
+static void
+__rec_cell_build_addr(WT_RECONCILE *r,
+ const void *addr, size_t size, u_int cell_type, uint64_t recno)
+{
+ WT_KV *val;
+
+ val = &r->v;
+
+ /*
+ * We don't check the address size because we can't store an address on
+ * an overflow page: if the address won't fit, the overflow page's
+ * address won't fit either. This possibility must be handled by Btree
+ * configuration, we have to disallow internal page sizes that are too
+ * small with respect to the largest address cookie the underlying block
+ * manager might return.
+ */
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = addr;
+ val->buf.size = size;
+ val->cell_len =
+ __wt_cell_pack_addr(&val->cell, cell_type, recno, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+}
+
+/*
+ * __rec_cell_build_val --
+ * Process a data item and return a WT_CELL structure and byte string to
+ * be stored on the page.
+ */
+static int
+__rec_cell_build_val(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, const void *data, size_t size, uint64_t rle)
+{
+ WT_BTREE *btree;
+ WT_KV *val;
+
+ btree = S2BT(session);
+
+ val = &r->v;
+
+ /*
+ * We don't copy the data into the buffer, it's not necessary; just
+ * re-point the buffer's data/length fields.
+ */
+ val->buf.data = data;
+ val->buf.size = size;
+
+ /* Handle zero-length cells quickly. */
+ if (size != 0) {
+ /* Optionally compress the data using the Huffman engine. */
+ if (btree->huffman_value != NULL)
+ WT_RET(__wt_huffman_encode(
+ session, btree->huffman_value,
+ val->buf.data, (uint32_t)val->buf.size, &val->buf));
+
+ /* Create an overflow object if the data won't fit. */
+ if (val->buf.size > btree->maxleafitem) {
+ WT_STAT_FAST_DATA_INCR(session, rec_overflow_value);
+
+ return (__rec_cell_build_ovfl(
+ session, r, val, WT_CELL_VALUE_OVFL, rle));
+ }
+ }
+ val->cell_len = __wt_cell_pack_data(&val->cell, rle, val->buf.size);
+ val->len = val->cell_len + val->buf.size;
+
+ return (0);
+}
+
+/*
+ * __rec_cell_build_ovfl --
+ * Store overflow items in the file, returning the address cookie.
+ */
+static int
+__rec_cell_build_ovfl(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_KV *kv, uint8_t type, uint64_t rle)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_PAGE_HEADER *dsk;
+ size_t size;
+ uint8_t *addr, buf[WT_BTREE_MAX_ADDR_COOKIE];
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ page = r->page;
+
+ /* Track if page has overflow items. */
+ r->ovfl_items = 1;
+
+ /*
+ * See if this overflow record has already been written and reuse it if
+ * possible. Else, write a new overflow record.
+ */
+ if (!__wt_ovfl_reuse_search(session, page,
+ &addr, &size, kv->buf.data, kv->buf.size)) {
+ /* Allocate a buffer big enough to write the overflow record. */
+ size = kv->buf.size;
+ WT_RET(bm->write_size(bm, session, &size));
+ WT_RET(__wt_scr_alloc(session, size, &tmp));
+
+ /* Initialize the buffer: disk header and overflow record. */
+ dsk = tmp->mem;
+ memset(dsk, 0, WT_PAGE_HEADER_SIZE);
+ dsk->type = WT_PAGE_OVFL;
+ dsk->u.datalen = (uint32_t)kv->buf.size;
+ memcpy(WT_PAGE_HEADER_BYTE(btree, dsk),
+ kv->buf.data, kv->buf.size);
+ dsk->mem_size = tmp->size =
+ WT_PAGE_HEADER_BYTE_SIZE(btree) + (uint32_t)kv->buf.size;
+
+ /* Write the buffer. */
+ addr = buf;
+ WT_ERR(__wt_bt_write(session, tmp, addr, &size, 0, 0));
+
+ /*
+ * Track the overflow record (unless it's a bulk load, which
+ * by definition won't ever reuse a record).
+ */
+ if (!r->is_bulk_load)
+ WT_ERR(__wt_ovfl_reuse_add(session, page,
+ addr, size, kv->buf.data, kv->buf.size));
+ }
+
+ /* Set the caller's K/V to reference the overflow record's address. */
+ WT_ERR(__wt_buf_set(session, &kv->buf, addr, size));
+
+ /* Build the cell and return. */
+ kv->cell_len = __wt_cell_pack_ovfl(&kv->cell, type, rle, kv->buf.size);
+ kv->len = kv->cell_len + kv->buf.size;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
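+
+/*
+ * For reference, the buffer written above is a self-contained overflow
+ * disk page: a zeroed page header with type WT_PAGE_OVFL and the data
+ * length set, followed by the raw item bytes.  The address cookie
+ * returned by __wt_bt_write is what subsequently gets packed into the
+ * overflow cell on the parent page.
+ */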
+
+/*
+ * The dictionary --
+ * The rest of this file is support for dictionaries.
+ *
+ * It's difficult to write generic skiplist functions without turning a single
+ * memory allocation into two, or requiring a function call instead of a simple
+ * comparison. Fortunately, skiplists are relatively simple things and we can
+ * include them in-place. If you need generic skip-list functions to modify,
+ * this set wouldn't be a bad place to start.
+ *
+ * __rec_dictionary_skip_search --
+ * Search a dictionary skiplist.
+ */
+static WT_DICTIONARY *
+__rec_dictionary_skip_search(WT_DICTIONARY **head, uint64_t hash)
+{
+ WT_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
+ if (*e == NULL) { /* Empty levels */
+ --i;
+ --e;
+ continue;
+ }
+
+ /*
+ * Return any exact matches: we don't care in what search level
+ * we found a match.
+ */
+ if ((*e)->hash == hash) /* Exact match */
+ return (*e);
+ if ((*e)->hash > hash) { /* Drop down a level */
+ --i;
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ }
+ return (NULL);
+}
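+
+/*
+ * Worked example: searching for hash 30 in a list containing hashes
+ * 10, 30 and 50, we move right along a level while the next entry's
+ * hash is smaller than 30, drop down a level when it is larger, and
+ * return the entry as soon as its hash matches exactly.
+ */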
+
+/*
+ * __rec_dictionary_skip_search_stack --
+ * Search a dictionary skiplist, returning an insert/remove stack.
+ */
+static void
+__rec_dictionary_skip_search_stack(
+ WT_DICTIONARY **head, WT_DICTIONARY ***stack, uint64_t hash)
+{
+ WT_DICTIONARY **e;
+ int i;
+
+ /*
+ * Start at the highest skip level, then go as far as possible at each
+ * level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;)
+ if (*e == NULL || (*e)->hash > hash)
+ stack[i--] = e--; /* Drop down a level */
+ else
+ e = &(*e)->next[i]; /* Keep going at this level */
+}
+
+/*
+ * __rec_dictionary_skip_insert --
+ * Insert an entry into the dictionary skip-list.
+ */
+static void
+__rec_dictionary_skip_insert(
+ WT_DICTIONARY **head, WT_DICTIONARY *e, uint64_t hash)
+{
+ WT_DICTIONARY **stack[WT_SKIP_MAXDEPTH];
+ u_int i;
+
+ /* Insert the new entry into the skiplist. */
+ __rec_dictionary_skip_search_stack(head, stack, hash);
+ for (i = 0; i < e->depth; ++i) {
+ e->next[i] = *stack[i];
+ *stack[i] = e;
+ }
+}
+
+/*
+ * __rec_dictionary_init --
+ * Allocate and initialize the dictionary.
+ */
+static int
+__rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots)
+{
+ u_int depth, i;
+
+ /* Free any previous dictionary. */
+ __rec_dictionary_free(session, r);
+
+ r->dictionary_slots = slots;
+ WT_RET(__wt_calloc(session,
+ r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary));
+ for (i = 0; i < r->dictionary_slots; ++i) {
+ depth = __wt_skip_choose_depth(session);
+ WT_RET(__wt_calloc(session, 1,
+ sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *),
+ &r->dictionary[i]));
+ r->dictionary[i]->depth = depth;
+ }
+ return (0);
+}
+
+/*
+ * __rec_dictionary_free --
+ * Free the dictionary.
+ */
+static void
+__rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ u_int i;
+
+ if (r->dictionary == NULL)
+ return;
+
+ /*
+ * We don't correct dictionary_slots when we fail during allocation,
+ * but that's OK, the value is either NULL or a memory reference to
+ * be free'd.
+ */
+ for (i = 0; i < r->dictionary_slots; ++i)
+ __wt_free(session, r->dictionary[i]);
+ __wt_free(session, r->dictionary);
+}
+
+/*
+ * __rec_dictionary_reset --
+ * Reset the dictionary when reconciliation restarts and when crossing a
+ * page boundary (a potential split).
+ */
+static void
+__rec_dictionary_reset(WT_RECONCILE *r)
+{
+ if (r->dictionary_slots) {
+ r->dictionary_next = 0;
+ memset(r->dictionary_head, 0, sizeof(r->dictionary_head));
+ }
+}
+
+/*
+ * __rec_dictionary_lookup --
+ * Check the dictionary for a matching value on this page.
+ */
+static int
+__rec_dictionary_lookup(
+ WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *val, WT_DICTIONARY **dpp)
+{
+ WT_DICTIONARY *dp, *next;
+ uint64_t hash;
+ int match;
+
+ *dpp = NULL;
+
+ /* Search the dictionary, and return any match we find. */
+ hash = __wt_hash_fnv64(val->buf.data, val->buf.size);
+ for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash);
+ dp != NULL && dp->hash == hash; dp = dp->next[0]) {
+ WT_RET(__wt_cell_pack_data_match(
+ dp->cell, &val->cell, val->buf.data, &match));
+ if (match) {
+ WT_STAT_FAST_DATA_INCR(session, rec_dictionary);
+ *dpp = dp;
+ return (0);
+ }
+ }
+
+ /*
+ * We're not doing value replacement in the dictionary. We stop adding
+ * new entries if we run out of empty dictionary slots (but continue to
+ * use the existing entries). I can't think of any reason a leaf page
+ * value is more likely to be seen because it was seen more recently
+ * than some other value: if we find working sets where that's not the
+ * case, it shouldn't be too difficult to maintain a pointer which is
+ * the next dictionary slot to re-use.
+ */
+ if (r->dictionary_next >= r->dictionary_slots)
+ return (0);
+
+ /*
+ * Set the hash value, we'll add this entry into the dictionary when we
+ * write it into the page's disk image buffer (because that's when we
+ * know where on the page it will be written).
+ */
+ next = r->dictionary[r->dictionary_next++];
+ next->cell = NULL; /* Not necessary, just cautious. */
+ next->hash = hash;
+ __rec_dictionary_skip_insert(r->dictionary_head, next, hash);
+ *dpp = next;
+ return (0);
+}
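+
+/*
+ * Usage sketch (assumed; __rec_dict_replace's body is elsewhere in this
+ * file): the caller looks up the value about to be written, and on a
+ * match with an entry whose cell has already been written it can emit a
+ * small copy cell referencing the earlier on-page value instead of
+ * writing the value bytes a second time:
+ *
+ *	WT_DICTIONARY *dp;
+ *	WT_RET(__rec_dictionary_lookup(session, r, val, &dp));
+ *	if (dp != NULL && dp->cell != NULL)
+ *		(build a copy cell referencing dp->cell)
+ */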
diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c
new file mode 100644
index 00000000000..308bc1f0dc5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_key.c
@@ -0,0 +1,500 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);
+
+/*
+ * __wt_row_leaf_keys --
+ * Instantiate the interesting keys for random search of a page.
+ */
+int
+__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(key);
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_ROW *rip;
+ uint32_t gap, i;
+
+ btree = S2BT(session);
+
+ if (page->pg_row_entries == 0) { /* Just checking... */
+ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
+ return (0);
+ }
+
+ /*
+ * Row-store leaf pages are written as one big prefix-compressed chunk,
+ * that is, only the first key on the page is not prefix-compressed, and
+ * to instantiate the last key on the page, you have to take the first
+ * key on the page and roll it forward to the end of the page. We don't
+ * want to do that on every page access, of course, so we instantiate a
+ * set of keys, essentially creating prefix chunks on the page, where we
+ * can roll forward from the closest, previous, instantiated key. The
+ * complication is that not all keys on a page are equal: we're doing a
+ * binary search on the page, which means there are keys we look at a
+ * lot (every time we search the page), and keys we never look at unless
+ * they are actually being searched for. This function figures out the
+ * "interesting" keys on a page, and then we sequentially walk that list
+ * instantiating those keys.
+ *
+ * Allocate a bit array and figure out the set of "interesting" keys,
+ * marking up the array.
+ */
+ WT_RET(__wt_scr_alloc(session, 0, &key));
+ WT_RET(__wt_scr_alloc(session,
+ (uint32_t)__bitstr_size(page->pg_row_entries), &tmp));
+
+ if ((gap = btree->key_gap) == 0)
+ gap = 1;
+ __inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap);
+
+ /* Instantiate the keys. */
+ for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i)
+ if (__bit_test(tmp->mem, i))
+ WT_ERR(__wt_row_leaf_key_work(
+ session, page, rip, key, 1));
+
+ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
+
+err: __wt_scr_free(&key);
+ __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __inmem_row_leaf_slots --
+ * Figure out the interesting slots of a page for random search, up to
+ * the specified depth.
+ */
+static void
+__inmem_row_leaf_slots(
+ uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
+{
+ uint32_t indx, limit;
+
+ if (entries < gap)
+ return;
+
+ /*
+ * !!!
+ * Don't clean this code up -- it deliberately looks like the binary
+ * search code.
+ *
+ * !!!
+ * There's got to be a function that would give me this information, but
+ * I don't see any performance reason we can't just do this recursively.
+ */
+ limit = entries;
+ indx = base + (limit >> 1);
+ __bit_set(list, indx);
+
+ __inmem_row_leaf_slots(list, base, limit >> 1, gap);
+
+ base = indx + 1;
+ --limit;
+ __inmem_row_leaf_slots(list, base, limit >> 1, gap);
+}
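+
+/*
+ * Worked example: with 8 entries and a key gap of 3, the recursion
+ * marks slot 4 (the page midpoint), then slots 2 and 6 (the midpoints
+ * of each half), and stops because the remaining ranges are smaller
+ * than the gap -- exactly the first slots a binary search of the page
+ * would visit.
+ */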
+
+/*
+ * __wt_row_leaf_key_copy --
+ * Get a copy of a row-store leaf-page key.
+ */
+int
+__wt_row_leaf_key_copy(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key)
+{
+ WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));
+
+ /* The return buffer may only hold a reference to a key, copy it. */
+ if (!WT_DATA_IN_ITEM(key))
+ WT_RET(__wt_buf_set(session, key, key->data, key->size));
+
+ return (0);
+}
+
+/*
+ * __wt_row_leaf_key_work --
+ * Return a reference to a row-store leaf-page key, optionally instantiating
+ * the key into the in-memory page.
+ */
+int
+__wt_row_leaf_key_work(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate)
+{
+ enum { FORWARD, BACKWARD } direction;
+ WT_BTREE *btree;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_IKEY *ikey;
+ WT_ROW *rip, *jump_rip;
+ size_t size;
+ u_int last_prefix;
+ int jump_slot_offset, slot_offset;
+ void *copy;
+ const void *p;
+
+ /*
+ * !!!
+ * It is unusual to call this function: most code should be calling the
+ * front-end, __wt_row_leaf_key, be careful if you're calling this code
+ * directly.
+ */
+
+ btree = S2BT(session);
+ unpack = &_unpack;
+ rip = rip_arg;
+
+ jump_rip = NULL;
+ jump_slot_offset = 0;
+ last_prefix = 0;
+
+ p = NULL; /* -Werror=maybe-uninitialized */
+ size = 0; /* -Werror=maybe-uninitialized */
+
+ direction = BACKWARD;
+ for (slot_offset = 0;;) {
+ if (0) {
+switch_and_jump: /* Switching to a forward roll. */
+ WT_ASSERT(session, direction == BACKWARD);
+ direction = FORWARD;
+
+ /* Skip list of keys with compatible prefixes. */
+ rip = jump_rip;
+ slot_offset = jump_slot_offset;
+ }
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Figure out what the key looks like.
+ */
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, &p, &size);
+
+ /* 1: the test for a directly referenced on-page key. */
+ if (cell == NULL) {
+ keyb->data = p;
+ keyb->size = size;
+
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, or if
+ * it's an overflow key or not, it's what we wanted.
+ * This shouldn't normally happen, the fast-path code
+ * that front-ends this function will have figured it
+ * out before we were called.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ if (slot_offset == 0)
+ goto done;
+
+ /*
+ * This key is not an overflow key by definition and
+ * isn't compressed in any way, we can use it to roll
+ * forward.
+ * If rolling backward, switch directions.
+ * If rolling forward: there's a bug somewhere,
+ * we should have hit this key when rolling backward.
+ */
+ goto switch_and_jump;
+ }
+
+ /* 2: the test for an instantiated off-page key. */
+ if (ikey != NULL) {
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, or if
+ * it's an overflow key or not, it's what we wanted.
+ * Take a copy and wrap up.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ if (slot_offset == 0) {
+ keyb->data = p;
+ keyb->size = size;
+ goto done;
+ }
+
+ /*
+ * If we wanted a different key and this key is an
+ * overflow key:
+ * If we're rolling backward, this key is useless
+ * to us because it doesn't have a valid prefix: keep
+ * rolling backward.
+ * If we're rolling forward, there's no work to be
+ * done because prefixes skip overflow keys: keep
+ * rolling forward.
+ */
+ if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
+ goto next;
+
+ /*
+ * If we wanted a different key and this key is not an
+ * overflow key, it has a valid prefix, we can use it.
+ * If rolling backward, take a copy of the key and
+ * switch directions, we can roll forward from this key.
+ * If rolling forward, replace the key we've been
+ * building with this key, it's what we would have built
+ * anyway.
+ * In short: if it's not an overflow key, take a copy
+ * and roll forward.
+ */
+ keyb->data = p;
+ keyb->size = size;
+ direction = FORWARD;
+ goto next;
+ }
+
+ /*
+ * It must be an on-page cell, unpack it.
+ */
+ __wt_cell_unpack(cell, unpack);
+
+ /* 3: the test for an on-page reference to an overflow key. */
+ if (unpack->type == WT_CELL_KEY_OVFL) {
+ /*
+ * If this is the key we wanted from the start, we don't
+ * care if it's an overflow key, get a copy and wrap up.
+ *
+ * Avoid racing with reconciliation deleting overflow
+ * keys. Deleted overflow keys must be instantiated
+ * first, acquire the overflow lock and check. Read
+ * the key if we still need to do so, but holding the
+ * overflow lock. Note we are not using the version of
+ * the cell-data-ref calls that acquire the overflow
+ * lock and do a look-aside into the tracking cache:
+ * this is an overflow key, not a value, meaning it's
+ * instantiated before being deleted, not copied into
+ * the tracking cache.
+ */
+ if (slot_offset == 0) {
+ WT_ERR(
+ __wt_readlock(session, btree->ovfl_lock));
+ copy = WT_ROW_KEY_COPY(rip);
+ if (!__wt_row_leaf_key_info(page, copy,
+ NULL, &cell, &keyb->data, &keyb->size)) {
+ __wt_cell_unpack(cell, unpack);
+ ret = __wt_dsk_cell_data_ref(session,
+ WT_PAGE_ROW_LEAF, unpack, keyb);
+ }
+ WT_TRET(
+ __wt_readunlock(session, btree->ovfl_lock));
+ WT_ERR(ret);
+ break;
+ }
+
+ /*
+ * If we wanted a different key:
+ * If we're rolling backward, this key is useless
+ * to us because it doesn't have a valid prefix: keep
+ * rolling backward.
+ * If we're rolling forward, there's no work to be
+ * done because prefixes skip overflow keys: keep
+ * rolling forward.
+ */
+ goto next;
+ }
+
+ /*
+ * 4: the test for an on-page reference to a key that isn't
+ * prefix compressed.
+ */
+ if (unpack->prefix == 0) {
+ /*
+ * The only reason to be here is a Huffman encoded key,
+ * a non-encoded key with no prefix compression should
+ * have been directly referenced, and we should not have
+ * needed to unpack its cell.
+ */
+ WT_ASSERT(session, btree->huffman_key != NULL);
+
+ /*
+ * If this is the key we originally wanted, we don't
+ * care if we're rolling forward or backward, it's
+ * what we want. Take a copy and wrap up.
+ *
+ * If we wanted a different key, this key has a valid
+ * prefix, we can use it.
+ * If rolling backward, take a copy of the key and
+ * switch directions, we can roll forward from this key.
+ * If rolling forward there's a bug, we should have
+ * found this key while rolling backwards and switched
+ * directions then.
+ *
+ * The key doesn't need to be instantiated, skip past
+ * that test.
+ */
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, unpack, keyb));
+ if (slot_offset == 0)
+ goto done;
+ goto switch_and_jump;
+ }
+
+ /*
+ * 5: an on-page reference to a key that's prefix compressed.
+ * If rolling backward, keep looking for something we can
+ * use.
+ * If rolling forward, build the full key and keep rolling
+ * forward.
+ */
+ if (direction == BACKWARD) {
+ /*
+ * If there's a set of keys with identical prefixes, we
+ * don't want to instantiate each one, the prefixes are
+ * all the same.
+ *
+ * As we roll backward through the page, track the last
+ * time the prefix decreased in size, so we can start
+ * with that key during our roll-forward. For a page
+ * populated with a single key prefix, we'll be able to
+ * instantiate the key we want as soon as we find a key
+ * without a prefix.
+ */
+ if (slot_offset == 0)
+ last_prefix = unpack->prefix;
+ if (slot_offset == 0 || last_prefix > unpack->prefix) {
+ jump_rip = rip;
+ jump_slot_offset = slot_offset;
+ last_prefix = unpack->prefix;
+ }
+ }
+ if (direction == FORWARD) {
+ /*
+ * Get a reference to the current key's bytes. Usually
+ * we want bytes from the page, fast-path that case.
+ */
+ if (btree->huffman_key == NULL) {
+ p = unpack->data;
+ size = unpack->size;
+ } else {
+ if (tmp == NULL)
+ WT_ERR(
+ __wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_dsk_cell_data_ref(
+ session, WT_PAGE_ROW_LEAF, unpack, tmp));
+ p = tmp->data;
+ size = tmp->size;
+ }
+
+ /*
+ * Grow the buffer as necessary as well as ensure data
+ * has been copied into local buffer space, then append
+ * the suffix to the prefix already in the buffer.
+ *
+ * Don't grow the buffer unnecessarily or copy data we
+ * don't need, truncate the item's data length to the
+ * prefix bytes.
+ */
+ keyb->size = unpack->prefix;
+ WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
+ memcpy((uint8_t *)keyb->data + keyb->size, p, size);
+ keyb->size += size;
+
+ if (slot_offset == 0)
+ break;
+ }
+
+next: switch (direction) {
+ case BACKWARD:
+ --rip;
+ ++slot_offset;
+ break;
+ case FORWARD:
+ ++rip;
+ --slot_offset;
+ break;
+ }
+ }
+
+ /*
+ * Optionally instantiate the key: there's a cost to figuring out a key
+ * value in a leaf page with prefix-compressed or Huffman encoded keys,
+ * amortize the cost by instantiating a copy of the calculated key in
+ * allocated memory. We don't instantiate keys when pages are first
+ * brought into memory because it's wasted effort if the page is only
+ * read by a cursor in sorted order. If, instead, the page is read by a
+ * cursor in reverse order, we immediately instantiate periodic keys for
+ * the page (otherwise the reverse walk would be insanely slow). If,
+ * instead, the page is randomly searched, we instantiate keys as they
+ * are accessed (meaning, for example, as long as the binary search only
+ * touches one-half of the page, the only keys we instantiate will be in
+ * that half of the page).
+ */
+ if (instantiate) {
+ copy = WT_ROW_KEY_COPY(rip_arg);
+ (void)__wt_row_leaf_key_info(
+ page, copy, &ikey, &cell, NULL, NULL);
+ if (ikey == NULL) {
+ WT_ERR(__wt_row_ikey(session,
+ WT_PAGE_DISK_OFFSET(page, cell),
+ keyb->data, keyb->size, &ikey));
+
+ /*
+ * Serialize the swap of the key into place: on success,
+ * update the page's memory footprint, on failure, free
+ * the allocated memory.
+ */
+ if (WT_ATOMIC_CAS8(WT_ROW_KEY_COPY(rip), copy, ikey))
+ __wt_cache_page_inmem_incr(session,
+ page, sizeof(WT_IKEY) + ikey->size);
+ else
+ __wt_free(session, ikey);
+ }
+ }
+
+done:
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_row_ikey_incr --
+ * Instantiate a key in a WT_IKEY structure and increment the page's
+ * memory footprint.
+ */
+int
+__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
+ uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+{
+ WT_RET(__wt_row_ikey(session, cell_offset, key, size, ikeyp));
+
+ __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);
+
+ return (0);
+}
+
+/*
+ * __wt_row_ikey --
+ * Instantiate a key in a WT_IKEY structure.
+ */
+int
+__wt_row_ikey(WT_SESSION_IMPL *session,
+ uint32_t cell_offset, const void *key, size_t size, void *ikeyp)
+{
+ WT_IKEY *ikey;
+
+ /*
+ * Allocate memory for the WT_IKEY structure and the key, then copy
+ * the key into place.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
+ ikey->size = WT_STORE_SIZE(size);
+ ikey->cell_offset = cell_offset;
+ memcpy(WT_IKEY_DATA(ikey), key, size);
+
+ *(WT_IKEY **)ikeyp = ikey;
+ return (0);
+}
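+
+/*
+ * Note the allocation layout: the key bytes are stored immediately
+ * after the WT_IKEY structure in a single allocation, which is why one
+ * __wt_calloc of sizeof(WT_IKEY) + size is sufficient and WT_IKEY_DATA
+ * can derive the key's address from the structure's address alone.
+ */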
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
new file mode 100644
index 00000000000..e0036d14cbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -0,0 +1,346 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_modify_alloc --
+ * Allocate a page's modification structure.
+ */
+int
+__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_PAGE_MODIFY *modify;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_calloc_def(session, 1, &modify));
+
+ /*
+ * Select a spinlock for the page; let the barrier immediately below
+ * keep things from racing too badly.
+ */
+ modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn);
+
+ /*
+ * Multiple threads of control may be searching and deciding to modify
+ * a page. If our modify structure is used, update the page's memory
+ * footprint, else discard the modify structure, another thread did the
+ * work.
+ */
+ if (WT_ATOMIC_CAS8(page->modify, NULL, modify))
+ __wt_cache_page_inmem_incr(session, page, sizeof(*modify));
+ else
+ __wt_free(session, modify);
+ return (0);
+}
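+
+/*
+ * The pattern above -- allocate, publish with an atomic compare-and-swap,
+ * free on failure -- recurs below (see WT_PAGE_ALLOC_AND_SWAP); it lets
+ * racing threads create shared structures without taking a lock.
+ */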
+
+/*
+ * __wt_row_modify --
+ * Row-store insert, update and delete.
+ */
+int
+__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+{
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head, **ins_headp;
+ WT_PAGE *page;
+ WT_UPDATE *old_upd, **upd_entry;
+ size_t ins_size, upd_size;
+ uint32_t ins_slot;
+ u_int i, skipdepth;
+ int logged;
+
+ ins = NULL;
+ page = cbt->ref->page;
+ logged = 0;
+
+ /* This code expects a remove to have a NULL value. */
+ if (is_remove)
+ value = NULL;
+
+ /* If we don't yet have a modify structure, we'll need one. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ /*
+ * Modify: allocate an update array as necessary, build a WT_UPDATE
+ * structure, and call a serialized function to insert the WT_UPDATE
+ * structure.
+ *
+ * Insert: allocate an insert array as necessary, build a WT_INSERT
+ * and WT_UPDATE structure pair, and call a serialized function to
+ * insert the WT_INSERT structure.
+ */
+ if (cbt->compare == 0) {
+ if (cbt->ins == NULL) {
+ /* Allocate an update array as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_upd, upd_entry, page->pg_row_entries);
+
+ /* Set the WT_UPDATE array reference. */
+ upd_entry = &page->pg_row_upd[cbt->slot];
+ } else
+ upd_entry = &cbt->ins->upd;
+
+ if (upd == NULL) {
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = *upd_entry));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else {
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ /*
+ * We are restoring updates that couldn't be evicted,
+ * there should only be one update list per key.
+ */
+ WT_ASSERT(session, *upd_entry == NULL);
+ /*
+ * Set the "old" entry to the second update in the list
+ * so that the serialization function succeeds in
+ * swapping the first update into place.
+ */
+ old_upd = *upd_entry = upd->next;
+ }
+
+ /*
+ * Point the new WT_UPDATE item to the next element in the list.
+ * If we get it right, the serialization function lock acts as
+ * our memory barrier to flush this write.
+ */
+ upd->next = old_upd;
+
+ /* Serialize the update. */
+ WT_ERR(__wt_update_serial(
+ session, page, upd_entry, &upd, upd_size));
+ } else {
+ /*
+ * Allocate the insert array as necessary.
+ *
+ * We allocate an additional insert array slot for insert keys
+ * sorting less than any key on the page. The test to select
+ * that slot is baroque: if the search returned the first page
+ * slot, we didn't end up processing an insert list, and the
+ * comparison value indicates the search key was smaller than
+ * the returned slot, then we're using the smallest-key insert
+ * slot. That's hard, so we set a flag.
+ */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_ins, ins_headp, page->pg_row_entries + 1);
+
+ ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ?
+ page->pg_row_entries : cbt->slot;
+ ins_headp = &page->pg_row_ins[ins_slot];
+
+ /* Allocate the WT_INSERT_HEAD structure as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
+ ins_head = *ins_headp;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
+ * update the cursor to reference it (the WT_INSERT_HEAD might
+ * be allocated, the WT_INSERT was allocated).
+ */
+ WT_ERR(__wt_row_insert_alloc(
+ session, key, skipdepth, &ins, &ins_size));
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ ins->upd = upd;
+ ins_size += upd_size;
+
+ /*
+ * If there was no insert list during the search, the cursor's
+ * information cannot be correct, search couldn't have
+ * initialized it.
+ *
+ * Otherwise, point the new WT_INSERT item's skiplist to the
+ * next elements in the insert list (which we will check are
+ * still valid inside the serialization function).
+ *
+ * The serial mutex acts as our memory barrier to flush these
+ * writes before inserting them into the list.
+ */
+ if (WT_SKIP_FIRST(ins_head) == NULL)
+ for (i = 0; i < skipdepth; i++) {
+ cbt->ins_stack[i] = &ins_head->head[i];
+ ins->next[i] = cbt->next_stack[i] = NULL;
+ }
+ else
+ for (i = 0; i < skipdepth; i++)
+ ins->next[i] = cbt->next_stack[i];
+
+ /* Insert the WT_INSERT structure. */
+ WT_ERR(__wt_insert_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, skipdepth));
+ }
+
+ if (logged)
+ WT_ERR(__wt_txn_log_op(session, cbt));
+
+ if (0) {
+err: /*
+ * Remove the update from the current transaction, so we don't
+ * try to modify it on rollback.
+ */
+ if (logged)
+ __wt_txn_unmodify(session);
+ __wt_free(session, ins);
+ cbt->ins = NULL;
+ __wt_free(session, upd);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_row_insert_alloc --
+ * Row-store insert: allocate a WT_INSERT structure and fill it in.
+ */
+int
+__wt_row_insert_alloc(WT_SESSION_IMPL *session,
+ WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+{
+ WT_INSERT *ins;
+ size_t ins_size;
+
+ /*
+ * Allocate the WT_INSERT structure, next pointers for the skip list,
+ * and room for the key. Then copy the key into place.
+ */
+ ins_size = sizeof(WT_INSERT) +
+ skipdepth * sizeof(WT_INSERT *) + key->size;
+ WT_RET(__wt_calloc(session, 1, ins_size, &ins));
+
+ ins->u.key.offset = WT_STORE_SIZE(ins_size - key->size);
+ WT_INSERT_KEY_SIZE(ins) = WT_STORE_SIZE(key->size);
+ memcpy(WT_INSERT_KEY(ins), key->data, key->size);
+
+ *insp = ins;
+ if (ins_sizep != NULL)
+ *ins_sizep = ins_size;
+ return (0);
+}
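+
+/*
+ * For example, with a skip depth of 2 and a 10-byte key the allocation
+ * is laid out as [WT_INSERT][2 skiplist pointers][10 key bytes]:
+ * u.key.offset records where the key begins (sizeof(WT_INSERT) plus
+ * 2 * sizeof(WT_INSERT *)), so WT_INSERT_KEY can recover the key's
+ * address from the structure alone.
+ */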
+
+/*
+ * __wt_update_alloc --
+ * Allocate a WT_UPDATE structure and associated value and fill it in.
+ */
+int
+__wt_update_alloc(
+ WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
+{
+ WT_UPDATE *upd;
+ size_t size;
+
+ /*
+ * Allocate the WT_UPDATE structure and room for the value, then copy
+ * the value into place.
+ */
+ size = value == NULL ? 0 : value->size;
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
+ if (value == NULL)
+ WT_UPDATE_DELETED_SET(upd);
+ else {
+ upd->size = WT_STORE_SIZE(size);
+ memcpy(WT_UPDATE_DATA(upd), value->data, size);
+ }
+
+ *updp = upd;
+ *sizep = sizeof(WT_UPDATE) + size;
+ return (0);
+}
+
+/*
+ * __wt_update_obsolete_check --
+ * Check for obsolete updates.
+ */
+WT_UPDATE *
+__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_UPDATE *first, *next;
+
+ /*
+ * This function identifies obsolete updates, and truncates them from
+ * the rest of the chain; because this routine is called from inside
+ * a serialization function, the caller has responsibility for actually
+ * freeing the memory.
+ *
+ * Walk the list of updates, looking for obsolete updates at the end.
+ */
+ for (first = NULL; upd != NULL; upd = upd->next)
+ if (__wt_txn_visible_all(session, upd->txnid)) {
+ if (first == NULL)
+ first = upd;
+ } else if (upd->txnid != WT_TXN_ABORTED)
+ first = NULL;
+
+ /*
+ * We cannot discard this WT_UPDATE structure, we can only discard
+ * WT_UPDATE structures subsequent to it, other threads of control will
+ * terminate their walk in this element. Save a reference to the list
+ * we will discard, and terminate the list.
+ */
+ if (first != NULL &&
+ (next = first->next) != NULL &&
+ WT_ATOMIC_CAS8(first->next, next, NULL))
+ return (next);
+
+ return (NULL);
+}
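+
+/*
+ * Worked example: given the newest-to-oldest chain U1 -> U2 -> U3 where
+ * U2 and U3 are visible to all running transactions but U1 is not,
+ * "first" ends up pointing at U2; the compare-and-swap truncates the
+ * chain to U1 -> U2 and returns U3 for the caller to free.  U2 itself
+ * must survive because a concurrent walker may be positioned on it.
+ */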
+
+/*
+ * __wt_update_obsolete_free --
+ * Free an obsolete update list.
+ */
+void
+__wt_update_obsolete_free(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd)
+{
+ WT_UPDATE *next;
+ size_t size;
+
+ /* Free a WT_UPDATE list. */
+ for (size = 0; upd != NULL; upd = next) {
+ /* Deleted items have a dummy size: don't include that. */
+ size += sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ next = upd->next;
+ __wt_free(session, upd);
+ }
+ if (size != 0)
+ __wt_cache_page_inmem_decr(session, page, size);
+}
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
new file mode 100644
index 00000000000..b190aaaded5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -0,0 +1,553 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_search_insert_append --
+ * Fast append search of a row-store insert list, creating a skiplist stack
+ * as we go.
+ */
+static inline int
+__wt_search_insert_append(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, int *donep)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *inshead;
+ WT_ITEM key;
+ int cmp, i;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ *donep = 0;
+
+ inshead = cbt->ins_head;
+ if ((ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (0);
+ key.data = WT_INSERT_KEY(ins);
+ key.size = WT_INSERT_KEY_SIZE(ins);
+
+ WT_RET(__wt_compare(session, collator, srch_key, &key, &cmp));
+ if (cmp >= 0) {
+ /*
+ * !!!
+ * We may race with another appending thread.
+ *
+ * To catch that case, rely on the atomic pointer read above
+ * and set the next stack to NULL here. If we have raced with
+ * another thread, one of the next pointers will not be NULL by
+ * the time they are checked against the next stack inside the
+ * serialized insert function.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) {
+ cbt->ins_stack[i] = (i == 0) ? &ins->next[0] :
+ (inshead->tail[i] != NULL) ?
+ &inshead->tail[i]->next[i] : &inshead->head[i];
+ cbt->next_stack[i] = NULL;
+ }
+ cbt->compare = -cmp;
+ cbt->ins = ins;
+ *donep = 1;
+ }
+ return (0);
+}
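+
+/*
+ * The point of this fast path: for append-mostly workloads the new key
+ * usually sorts after the current last element, so a single comparison
+ * against WT_SKIP_LAST replaces the full skiplist descent done by
+ * __wt_search_insert below.
+ */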
+
+/*
+ * __wt_search_insert --
+ * Search a row-store insert list, creating a skiplist stack as we go.
+ */
+int
+__wt_search_insert(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_INSERT *ins, **insp, *last_ins;
+ WT_INSERT_HEAD *inshead;
+ WT_ITEM key;
+ size_t match, skiphigh, skiplow;
+ int cmp, i;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ inshead = cbt->ins_head;
+ cmp = 0; /* -Wuninitialized */
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ match = skiphigh = skiplow = 0;
+ ins = last_ins = NULL;
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) {
+ if ((ins = *insp) == NULL) {
+ cbt->next_stack[i] = NULL;
+ cbt->ins_stack[i--] = insp--;
+ continue;
+ }
+
+ /*
+		 * The same node may be revisited as we drop down skiplist
+		 * levels; don't repeat the comparison, it might be expensive.
+ */
+ if (ins != last_ins) {
+ last_ins = ins;
+ key.data = WT_INSERT_KEY(ins);
+ key.size = WT_INSERT_KEY_SIZE(ins);
+ match = WT_MIN(skiplow, skiphigh);
+ WT_RET(__wt_compare_skip(
+ session, collator, srch_key, &key, &cmp, &match));
+ }
+
+ if (cmp > 0) { /* Keep going at this level */
+ insp = &ins->next[i];
+ skiplow = match;
+ } else if (cmp < 0) { /* Drop down a level */
+ cbt->next_stack[i] = ins;
+ cbt->ins_stack[i--] = insp--;
+ skiphigh = match;
+ } else
+ for (; i >= 0; i--) {
+ cbt->next_stack[i] = ins->next[i];
+ cbt->ins_stack[i] = &ins->next[i];
+ }
+ }
+
+ /*
+ * For every insert element we review, we're getting closer to a better
+ * choice; update the compare field to its new value. If we went past
+ * the last item in the list, return the last one: that is used to
+ * decide whether we are positioned in a skiplist.
+ */
+ cbt->compare = -cmp;
+ cbt->ins = (ins != NULL) ? ins : last_ins;
+ return (0);
+}
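+
+/*
+ * The loop above is the classic skiplist descent, extended to record the
+ * insert position at every level. A simplified sketch of the same search
+ * over a toy node type (hypothetical names, illustration only): stack[i]
+ * is left pointing at the last node at level i whose key is less than the
+ * search key, so an insert would splice in after stack[i] at each level.
+ *
+ *	#define	MAXDEPTH	10
+ *	struct node {
+ *		int key;
+ *		struct node *next[MAXDEPTH];
+ *	};
+ *
+ *	static void
+ *	skip_search(struct node *head, int key, struct node **stack)
+ *	{
+ *		struct node *cur;
+ *		int i;
+ *
+ *		for (cur = head, i = MAXDEPTH - 1; i >= 0; i--) {
+ *			while (cur->next[i] != NULL &&
+ *			    cur->next[i]->key < key)
+ *				cur = cur->next[i];
+ *			stack[i] = cur;
+ *		}
+ *	}
+ *
+ * The version above differs in that it tracks pointer slots rather than
+ * nodes (so the stack can reference either the list head or a node's next
+ * array), and skips already-matched key prefixes during comparisons.
+ */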
+
+/*
+ * __wt_row_search --
+ * Search a row-store tree for a specific key.
+ */
+int
+__wt_row_search(WT_SESSION_IMPL *session,
+ WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert)
+{
+ WT_BTREE *btree;
+ WT_COLLATOR *collator;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+ WT_ROW *rip;
+ size_t match, skiphigh, skiplow;
+ uint32_t base, indx, limit;
+ int append_check, cmp, depth, descend_right, done;
+
+ btree = S2BT(session);
+ collator = btree->collator;
+ item = &cbt->search_key;
+
+ __cursor_pos_clear(cbt);
+
+ /*
+ * The row-store search routine uses a different comparison API.
+ * The assumption is we're comparing more than a few keys with
+ * matching prefixes, and it's a win to avoid the memory fetches
+ * by skipping over those prefixes. That's done by tracking the
+ * length of the prefix match for the lowest and highest keys we
+ * compare as we descend the tree.
+ */
+ skiphigh = skiplow = 0;
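+
+	/*
+	 * A simplified sketch of the skipping comparison (the real
+	 * __wt_lex_compare_skip also handles WT_ITEM arguments and a
+	 * pluggable collator path):
+	 *
+	 *	static int
+	 *	lex_compare_skip(const uint8_t *a, size_t alen,
+	 *	    const uint8_t *b, size_t blen, size_t *matchp)
+	 *	{
+	 *		size_t i, len = alen < blen ? alen : blen;
+	 *
+	 *		for (i = *matchp; i < len; i++)
+	 *			if (a[i] != b[i]) {
+	 *				*matchp = i;
+	 *				return (a[i] < b[i] ? -1 : 1);
+	 *			}
+	 *		*matchp = i;
+	 *		return (alen == blen ? 0 : alen < blen ? -1 : 1);
+	 *	}
+	 *
+	 * Because the keys are sorted, any key that sorts between two keys
+	 * sharing an N-byte prefix with the search key also shares that
+	 * prefix, so each comparison can safely start at offset
+	 * WT_MIN(skiplow, skiphigh).
+	 */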
+
+ /*
+ * If a cursor repeatedly appends to the tree, compare the search key
+ * against the last key on each internal page during insert before
+ * doing the full binary search.
+ *
+ * Track if the descent is to the right-side of the tree, used to set
+ * the cursor's append history.
+ */
+ append_check = insert && cbt->append_tree;
+ descend_right = 1;
+
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf != NULL) {
+ current = leaf;
+ goto leaf_only;
+ }
+
+ /* Search the internal pages of the tree. */
+ cmp = -1;
+ current = &btree->root;
+ for (depth = 2;; ++depth) {
+restart: page = current->page;
+ if (page->type != WT_PAGE_ROW_INT)
+ break;
+
+ pindex = WT_INTL_INDEX_COPY(page);
+
+ /*
+ * Fast-path internal pages with one child, a common case for
+ * the root page in new trees.
+ */
+ if (pindex->entries == 1) {
+ descent = pindex->index[0];
+ goto descend;
+ }
+
+ /* Fast-path appends. */
+ if (append_check) {
+ descent = pindex->index[pindex->entries - 1];
+ __wt_ref_key(page, descent, &item->data, &item->size);
+ WT_ERR(__wt_compare(
+ session, collator, srch_key, item, &cmp));
+ if (cmp >= 0)
+ goto descend;
+
+ /* A failed append check turns off append checks. */
+ append_check = 0;
+ }
+
+ /*
+ * Binary search of the internal page. There are two versions
+ * (a default loop and an application-specified collation loop),
+ * because moving the collation test and error handling inside
+ * the loop costs about 5%.
+ *
+ * The 0th key on an internal page is a problem for a couple of
+ * reasons. First, we have to force the 0th key to sort less
+ * than any application key, so internal pages don't have to be
+ * updated if the application stores a new, "smallest" key in
+ * the tree. Second, reconciliation is aware of this and will
+ * store a byte of garbage in the 0th key, so the comparison of
+ * an application key and a 0th key is meaningless (but doing
+ * the comparison could still incorrectly modify our tracking
+ * of the leading bytes in each key that we can skip during the
+ * comparison). For these reasons, skip the 0th key.
+ */
+ base = 1;
+ limit = pindex->entries - 1;
+ if (collator == NULL)
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+ __wt_ref_key(
+ page, descent, &item->data, &item->size);
+
+ match = WT_MIN(skiplow, skiphigh);
+ cmp = __wt_lex_compare_skip(
+ srch_key, item, &match);
+ if (cmp > 0) {
+ skiplow = match;
+ base = indx + 1;
+ --limit;
+ } else if (cmp < 0)
+ skiphigh = match;
+ else
+ goto descend;
+ }
+ else
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ descent = pindex->index[indx];
+ __wt_ref_key(
+ page, descent, &item->data, &item->size);
+
+ WT_ERR(__wt_compare(
+ session, collator, srch_key, item, &cmp));
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ goto descend;
+ }
+
+ /*
+ * Set the slot to descend the tree: descent is already set if
+ * there was an exact match on the page, otherwise, base is
+ * the smallest index greater than key, possibly (last + 1).
+ */
+ descent = pindex->index[base - 1];
+
+ /*
+ * If we end up somewhere other than the last slot, it's not a
+ * right-side descent.
+ */
+		if (pindex->entries != base)
+ descend_right = 0;
+
+descend: /*
+ * Swap the current page for the child page. If the page splits
+ * while we're retrieving it, restart the search in the current
+ * page; otherwise return on error, the swap call ensures we're
+ * holding nothing on failure.
+ */
+ switch (ret = __wt_page_swap(session, current, descent, 0)) {
+ case 0:
+ current = descent;
+ break;
+ case WT_RESTART:
+ skiphigh = skiplow = 0;
+ goto restart;
+ default:
+ return (ret);
+ }
+ }
+
+ /* Track how deep the tree gets. */
+ if (depth > btree->maximum_depth)
+ btree->maximum_depth = depth;
+
+leaf_only:
+ page = current->page;
+ cbt->ref = current;
+
+ /*
+ * In the case of a right-side tree descent during an insert, do a fast
+ * check for an append to the page, try to catch cursors appending data
+ * into the tree.
+ *
+ * It's tempting to make this test more rigorous: if a cursor inserts
+ * randomly into a two-level tree (a root referencing a single child
+ * that's empty except for an insert list), the right-side descent flag
+ * will be set and this comparison wasted. The problem resolves itself
+ * as the tree grows larger: either we're no longer doing right-side
+ * descent, or we'll avoid additional comparisons in internal pages,
+ * making up for the wasted comparison here. Similarly, the cursor's
+ * history is set any time it's an insert and a right-side descent,
+ * both to avoid a complicated/expensive test, and, in the case of
+ * multiple threads appending to the tree, we want to mark them all as
+ * appending, even if this test doesn't work.
+ */
+ if (insert && descend_right) {
+ cbt->append_tree = 1;
+
+ if (page->pg_row_entries == 0) {
+ cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
+
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ } else {
+ cbt->slot = WT_ROW_SLOT(page,
+ page->pg_row_d + (page->pg_row_entries - 1));
+
+ cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ }
+
+ WT_ERR(
+ __wt_search_insert_append(session, cbt, srch_key, &done));
+ if (done)
+ return (0);
+
+ /*
+ * Don't leave the insert list head set, code external to the
+ * search uses it.
+ */
+ cbt->ins_head = NULL;
+ }
+
+ /*
+ * Binary search of the leaf page. There are two versions (a default
+ * loop and an application-specified collation loop), because moving
+ * the collation test and error handling inside the loop costs about 5%.
+ */
+ base = 0;
+ limit = page->pg_row_entries;
+ if (collator == NULL)
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ rip = page->pg_row_d + indx;
+ WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
+
+ match = WT_MIN(skiplow, skiphigh);
+ cmp = __wt_lex_compare_skip(srch_key, item, &match);
+ if (cmp > 0) {
+ skiplow = match;
+ base = indx + 1;
+ --limit;
+ } else if (cmp < 0)
+ skiphigh = match;
+ else
+ goto leaf_match;
+ }
+ else
+ for (; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ rip = page->pg_row_d + indx;
+ WT_ERR(__wt_row_leaf_key(session, page, rip, item, 1));
+
+ WT_ERR(__wt_compare(
+ session, collator, srch_key, item, &cmp));
+ if (cmp > 0) {
+ base = indx + 1;
+ --limit;
+ } else if (cmp == 0)
+ goto leaf_match;
+ }
+
+ /*
+ * The best case is finding an exact match in the leaf page's WT_ROW
+ * array, probable for any read-mostly workload. Check that case and
+ * get out fast.
+ */
+ if (0) {
+leaf_match: cbt->compare = 0;
+ cbt->slot = WT_ROW_SLOT(page, rip);
+ return (0);
+ }
+
+ /*
+ * We didn't find an exact match in the WT_ROW array.
+ *
+ * Base is the smallest index greater than key and may be the 0th index
+ * or the (last + 1) index. Set the slot to be the largest index less
+ * than the key if that's possible (if base is the 0th index it means
+ * the application is inserting a key before any key found on the page).
+ *
+ * It's still possible there is an exact match, but it's on an insert
+ * list. Figure out which insert chain to search and then set up the
+ * return information assuming we'll find nothing in the insert list
+ * (we'll correct as needed inside the search routine, depending on
+ * what we find).
+ *
+ * If inserting a key smaller than any key found in the WT_ROW array,
+ * use the extra slot of the insert array, otherwise the insert array
+ * maps one-to-one to the WT_ROW array.
+ */
+ if (base == 0) {
+ cbt->compare = 1;
+ cbt->slot = WT_ROW_SLOT(page, page->pg_row_d);
+
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
+ } else {
+ cbt->compare = -1;
+ cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1));
+
+ cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot);
+ }
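+
+	/*
+	 * The slot and insert-list mapping used above, illustrated for a
+	 * page with three WT_ROW entries K0, K1, K2:
+	 *
+	 *	insert list:	smallest     0        1        2
+	 *	holds keys:	  < K0     K0..K1   K1..K2    > K2
+	 *
+	 * that is, insert list N holds keys sorting after K{N}, and the
+	 * extra "smallest" list holds keys sorting before every key on the
+	 * page.
+	 */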
+
+ /* If there's no insert list, we're done. */
+ if (WT_SKIP_FIRST(cbt->ins_head) == NULL)
+ return (0);
+
+ /*
+ * Test for an append first when inserting onto an insert list, try to
+ * catch cursors repeatedly inserting at a single point.
+ */
+ if (insert) {
+ WT_ERR(
+ __wt_search_insert_append(session, cbt, srch_key, &done));
+ if (done)
+ return (0);
+ }
+ WT_ERR(__wt_search_insert(session, cbt, srch_key));
+
+ return (0);
+
+	/* On error, release the page only if this search acquired it. */
+err:	if (leaf == NULL)
+ WT_TRET(__wt_page_release(session, current, 0));
+ return (ret);
+}
+
+/*
+ * __wt_row_random --
+ * Return a random key from a row-store tree.
+ */
+int
+__wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_INSERT *p, *t;
+ WT_PAGE *page;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *current, *descent;
+
+ btree = S2BT(session);
+
+ __cursor_pos_clear(cbt);
+
+restart:
+ /* Walk the internal pages of the tree. */
+ current = &btree->root;
+ for (;;) {
+ page = current->page;
+ if (page->type != WT_PAGE_ROW_INT)
+ break;
+
+ pindex = WT_INTL_INDEX_COPY(page);
+ descent = pindex->index[
+ __wt_random(session->rnd) % pindex->entries];
+
+ /*
+ * Swap the parent page for the child page; return on error,
+ * the swap function ensures we're holding nothing on failure.
+ */
+ if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) {
+ current = descent;
+ continue;
+ }
+ /*
+ * Restart is returned if we find a page that's been split; the
+ * held page isn't discarded when restart is returned, discard
+ * it and restart the search from the top of the tree.
+ */
+ if (ret == WT_RESTART &&
+ (ret = __wt_page_release(session, current, 0)) == 0)
+ goto restart;
+ return (ret);
+ }
+
+ if (page->pg_row_entries != 0) {
+ /*
+ * The use case for this call is finding a place to split the
+ * tree. Cheat (it's not like this is "random", anyway), and
+ * make things easier by returning the first key on the page.
+ * If the caller is attempting to split a newly created tree,
+ * or a tree with just one big page, that's not going to work,
+ * check for that.
+ */
+ cbt->ref = current;
+ cbt->compare = 0;
+ pindex = WT_INTL_INDEX_COPY(btree->root.page);
+ cbt->slot = pindex->entries < 2 ?
+ __wt_random(session->rnd) % page->pg_row_entries : 0;
+
+ return (__wt_row_leaf_key(session,
+ page, page->pg_row_d + cbt->slot, &cbt->search_key, 0));
+ }
+
+ /*
+ * If the tree is new (and not empty), it might have a large insert
+ * list, pick the key in the middle of that insert list.
+ */
+ F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
+ if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
+ WT_ERR(WT_NOTFOUND);
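+	/*
+	 * Walk the list with two cursors: p advances two nodes for each
+	 * node t advances, so when p runs off the end of the list, t is
+	 * left near the middle.
+	 */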
+ for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) {
+ if ((p = WT_SKIP_NEXT(p)) == NULL)
+ break;
+ if ((p = WT_SKIP_NEXT(p)) == NULL)
+ break;
+ t = WT_SKIP_NEXT(t);
+ }
+ cbt->ref = current;
+ cbt->compare = 0;
+ cbt->ins = t;
+
+ return (0);
+
+err: WT_TRET(__wt_page_release(session, current, 0));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config.c b/src/third_party/wiredtiger/src/config/config.c
new file mode 100644
index 00000000000..c792cb4fcf2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config.c
@@ -0,0 +1,745 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __config_err --
+ * Error message and return for config string parse failures.
+ */
+static int
+__config_err(WT_CONFIG *conf, const char *msg, int err)
+{
+ WT_RET_MSG(conf->session, err,
+ "Error parsing '%.*s' at byte %u: %s",
+ (int)(conf->end - conf->orig), conf->orig,
+ (u_int)(conf->cur - conf->orig), msg);
+}
+
+/*
+ * __wt_config_initn --
+ * Initialize a config handle, used to iterate through a config string of
+ * specified length.
+ */
+int
+__wt_config_initn(
+ WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len)
+{
+ conf->session = session;
+ conf->orig = conf->cur = str;
+ conf->end = str + len;
+ conf->depth = 0;
+ conf->top = -1;
+ conf->go = NULL;
+
+ return (0);
+}
+
+/*
+ * __wt_config_init --
+ * Initialize a config handle, used to iterate through a NUL-terminated
+ * config string.
+ */
+int
+__wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str)
+{
+ size_t len;
+
+ len = (str == NULL) ? 0 : strlen(str);
+
+ return (__wt_config_initn(session, conf, str, len));
+}
+
+/*
+ * __wt_config_subinit --
+ * Initialize a config handle, used to iterate through a config string
+ * extracted from another config string (used for parsing nested
+ * structures).
+ */
+int
+__wt_config_subinit(
+ WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item)
+{
+ return (__wt_config_initn(session, conf, item->str, item->len));
+}
+
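+/*
+ * PUSH and CAP drive the tokenizer below: PUSH records the start of a key
+ * or value when the parser is at the tracked nesting level (the offset
+ * argument skips characters, such as an opening quote, that aren't part of
+ * the item), and CAP records the item's end, setting its length.
+ */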
+#define PUSH(i, t) do { \
+ if (conf->top == -1) \
+ conf->top = conf->depth; \
+ if (conf->depth == conf->top) { \
+ if (out->len > 0) \
+ return (__config_err(conf, \
+ "New value starts without a separator", \
+ EINVAL)); \
+ out->type = (t); \
+ out->str = (conf->cur + (i)); \
+ } \
+} while (0)
+
+#define CAP(i) do { \
+ if (conf->depth == conf->top) \
+ out->len = (size_t)((conf->cur + (i) + 1) - out->str); \
+} while (0)
+
+typedef enum {
+ A_LOOP, A_BAD, A_DOWN, A_UP, A_VALUE, A_NEXT, A_QDOWN, A_QUP,
+ A_ESC, A_UNESC, A_BARE, A_NUMBARE, A_UNBARE, A_UTF8_2,
+ A_UTF8_3, A_UTF8_4, A_UTF_CONTINUE
+} CONFIG_ACTION;
+
+/*
+ * static void *gostruct[] = {
+ * [0 ... 255] = &&l_bad,
+ * ['\t'] = &&l_loop, [' '] = &&l_loop,
+ * ['\r'] = &&l_loop, ['\n'] = &&l_loop,
+ * ['"'] = &&l_qup,
+ * [':'] = &&l_value, ['='] = &&l_value,
+ * [','] = &&l_next,
+ * // tracking [] and {} individually would allow fuller
+ * // validation but is really messy
+ * ['('] = &&l_up, [')'] = &&l_down,
+ * ['['] = &&l_up, [']'] = &&l_down,
+ * ['{'] = &&l_up, ['}'] = &&l_down,
+ * // bare identifiers
+ * ['-'] = &&l_numbare,
+ * ['0' ... '9'] = &&l_numbare,
+ * ['_'] = &&l_bare,
+ * ['A' ... 'Z'] = &&l_bare, ['a' ... 'z'] = &&l_bare,
+ * ['/'] = &&l_bare,
+ * };
+ */
+static const int8_t gostruct[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_LOOP, A_LOOP, A_BAD, A_BAD, A_LOOP, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_BAD, A_QUP,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UP, A_DOWN, A_BAD, A_BAD,
+ A_NEXT, A_NUMBARE, A_BARE, A_BARE, A_NUMBARE, A_NUMBARE,
+ A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE, A_NUMBARE,
+ A_NUMBARE, A_NUMBARE, A_NUMBARE, A_VALUE, A_BAD, A_BAD,
+ A_VALUE, A_BAD, A_BAD, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD,
+ A_DOWN, A_BAD, A_BARE, A_BAD, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE,
+ A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_BARE, A_UP, A_BAD,
+ A_DOWN, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *gobare[] =
+ * {
+ * [0 ... 31] = &&l_bad,
+ * // could be more pedantic/validation-checking
+ * [32 ... 126] = &&l_loop,
+ * ['\t'] = &&l_unbare, [' '] = &&l_unbare,
+ * ['\r'] = &&l_unbare, ['\n'] = &&l_unbare,
+ * [':'] = &&l_unbare, ['='] = &&l_unbare,
+ * [','] = &&l_unbare,
+ * [')'] = &&l_unbare, [']'] = &&l_unbare, ['}'] = &&l_unbare,
+ * [127 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t gobare[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_UNBARE, A_UNBARE, A_BAD, A_BAD, A_UNBARE, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNBARE,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_UNBARE, A_LOOP, A_LOOP, A_UNBARE, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_UNBARE,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_UNBARE, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *gostring[] =
+ * {
+ * [0 ... 31] = &&l_bad, [127] = &&l_bad,
+ * [32 ... 126] = &&l_loop,
+ * ['\\'] = &&l_esc, ['"'] = &&l_qdown,
+ * [128 ... 191] = &&l_bad,
+ * [192 ... 223] = &&l_utf8_2,
+ * [224 ... 239] = &&l_utf8_3,
+ * [240 ... 247] = &&l_utf8_4,
+ * [248 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t gostring[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_LOOP, A_LOOP, A_QDOWN,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_ESC, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_LOOP,
+ A_LOOP, A_LOOP, A_LOOP, A_LOOP, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2, A_UTF8_2,
+ A_UTF8_2, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3,
+ A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3,
+ A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_3, A_UTF8_4,
+ A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4, A_UTF8_4,
+ A_UTF8_4, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *goutf8_continue[] =
+ * {
+ * [0 ... 127] = &&l_bad,
+ * [128 ... 191] = &&l_utf_continue,
+ * [192 ... 255] = &&l_bad
+ * };
+ */
+static const int8_t goutf8_continue[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE, A_UTF_CONTINUE,
+ A_UTF_CONTINUE, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * static void *goesc[] =
+ * {
+ * [0 ... 255] = &&l_bad,
+ * ['"'] = &&l_unesc, ['\\'] = &&l_unesc,
+ * ['/'] = &&l_unesc, ['b'] = &&l_unesc,
+ * ['f'] = &&l_unesc, ['n'] = &&l_unesc,
+ * ['r'] = &&l_unesc, ['t'] = &&l_unesc, ['u'] = &&l_unesc
+ * };
+ */
+static const int8_t goesc[256] = {
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_UNESC, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_UNESC, A_BAD,
+ A_BAD, A_BAD, A_UNESC, A_BAD, A_UNESC, A_UNESC, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD,
+ A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD, A_BAD
+};
+
+/*
+ * __config_next --
+ * Get the next config item in the string without processing the value.
+ */
+static int
+__config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM *out = key;
+ int utf8_remain = 0;
+ static const WT_CONFIG_ITEM true_value = {
+ "", 0, 1, WT_CONFIG_ITEM_BOOL
+ };
+
+ key->len = 0;
+ /* Keys with no value default to true. */
+ *value = true_value;
+
+ if (conf->go == NULL)
+ conf->go = gostruct;
+
+ while (conf->cur < conf->end) {
+		switch (conf->go[(u_char)*conf->cur]) {
+ case A_LOOP:
+ break;
+
+ case A_BAD:
+ return (__config_err(
+ conf, "Unexpected character", EINVAL));
+
+ case A_DOWN:
+ --conf->depth;
+ CAP(0);
+ break;
+
+ case A_UP:
+ if (conf->top == -1)
+ conf->top = 1;
+ PUSH(0, WT_CONFIG_ITEM_STRUCT);
+ ++conf->depth;
+ break;
+
+ case A_VALUE:
+ if (conf->depth == conf->top) {
+ /*
+ * Special case: ':' is permitted in unquoted
+ * values.
+ */
+ if (out == value && *conf->cur != ':')
+ return (__config_err(conf,
+ "Value already complete", EINVAL));
+ out = value;
+ }
+ break;
+
+ case A_NEXT:
+ /*
+ * If we're at the top level and we have a complete
+ * key (and optional value), we're done.
+ */
+ if (conf->depth == conf->top && key->len > 0) {
+ ++conf->cur;
+ return (0);
+ } else
+ break;
+
+ case A_QDOWN:
+ CAP(-1);
+ conf->go = gostruct;
+ break;
+
+ case A_QUP:
+ PUSH(1, WT_CONFIG_ITEM_STRING);
+ conf->go = gostring;
+ break;
+
+ case A_ESC:
+ conf->go = goesc;
+ break;
+
+ case A_UNESC:
+ conf->go = gostring;
+ break;
+
+ case A_BARE:
+ PUSH(0, WT_CONFIG_ITEM_ID);
+ conf->go = gobare;
+ break;
+
+ case A_NUMBARE:
+ PUSH(0, WT_CONFIG_ITEM_NUM);
+ conf->go = gobare;
+ break;
+
+ case A_UNBARE:
+ CAP(-1);
+ conf->go = gostruct;
+ continue;
+
+ case A_UTF8_2:
+ conf->go = goutf8_continue;
+ utf8_remain = 1;
+ break;
+
+ case A_UTF8_3:
+ conf->go = goutf8_continue;
+ utf8_remain = 2;
+ break;
+
+ case A_UTF8_4:
+ conf->go = goutf8_continue;
+ utf8_remain = 3;
+ break;
+
+ case A_UTF_CONTINUE:
+ if (!--utf8_remain)
+ conf->go = gostring;
+ break;
+ }
+
+ conf->cur++;
+ }
+
+ /* Might have a trailing key/value without a closing brace */
+ if (conf->go == gobare) {
+ CAP(-1);
+ conf->go = gostruct;
+ }
+
+ /* Did we find something? */
+ if (conf->depth <= conf->top && key->len > 0)
+ return (0);
+
+ /* We're either at the end of the string or we failed to parse. */
+ if (conf->depth == 0)
+ return (WT_NOTFOUND);
+
+ return (__config_err(conf,
+ "Closing brackets missing from config string", EINVAL));
+}
+
+/*
+ * Arithmetic shift of a negative number is undefined by ISO/IEC 9899, and the
+ * WiredTiger API supports negative numbers. Check it's not a negative number,
+ * and then cast the shift out of paranoia.
+ */
+#define WT_SHIFT_INT64(v, s) do { \
+ if ((v) < 0) \
+ goto range; \
+ (v) = (int64_t)(((uint64_t)(v)) << (s)); \
+} while (0)
+
+/*
+ * __config_process_value --
+ * Deal with special config values like true / false.
+ */
+static int
+__config_process_value(WT_CONFIG *conf, WT_CONFIG_ITEM *value)
+{
+ char *endptr;
+
+ /* Empty values are okay: we can't do anything interesting with them. */
+ if (value->len == 0)
+ return (0);
+
+ if (value->type == WT_CONFIG_ITEM_ID) {
+ if (strncasecmp(value->str, "true", value->len) == 0) {
+ value->type = WT_CONFIG_ITEM_BOOL;
+ value->val = 1;
+ } else if (strncasecmp(value->str, "false", value->len) == 0) {
+ value->type = WT_CONFIG_ITEM_BOOL;
+ value->val = 0;
+ }
+ } else if (value->type == WT_CONFIG_ITEM_NUM) {
+ errno = 0;
+ value->val = strtoll(value->str, &endptr, 10);
+
+ /* Check any leftover characters. */
+ while (endptr < value->str + value->len)
+ switch (*endptr++) {
+ case 'b':
+ case 'B':
+ /* Byte: no change. */
+ break;
+ case 'k':
+ case 'K':
+ WT_SHIFT_INT64(value->val, 10);
+ break;
+ case 'm':
+ case 'M':
+ WT_SHIFT_INT64(value->val, 20);
+ break;
+ case 'g':
+ case 'G':
+ WT_SHIFT_INT64(value->val, 30);
+ break;
+ case 't':
+ case 'T':
+ WT_SHIFT_INT64(value->val, 40);
+ break;
+ case 'p':
+ case 'P':
+ WT_SHIFT_INT64(value->val, 50);
+ break;
+ default:
+ /*
+ * We didn't get a well-formed number. That
+ * might be okay, the required type will be
+ * checked by __wt_config_check.
+ */
+ value->type = WT_CONFIG_ITEM_ID;
+ break;
+ }
+
+ /*
+ * If we parsed the whole string but the number is out of range,
+ * report an error. Don't report an error for strings that
+ * aren't well-formed integers: if an integer is expected, that
+ * will be caught by __wt_config_check.
+ */
+ if (value->type == WT_CONFIG_ITEM_NUM && errno == ERANGE)
+ goto range;
+ }
+
+ return (0);
+
+range:
+ return (__config_err(conf, "Number out of range", ERANGE));
+}
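+
+/*
+ * Illustrative results of the processing above (assumed input strings):
+ *
+ *	"cache_size=512K"	NUM item, val 512 * 1024
+ *	"overwrite=true"	BOOL item, val 1
+ *	"name=midnight"		ID item, the value left as a string
+ *	"size=10Q"		ID item: not a well-formed number, left
+ *				for __wt_config_check to reject if an
+ *				integer is required
+ */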
+
+/*
+ * __wt_config_next --
+ * Get the next config item in the string and process the value.
+ */
+int
+__wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_RET(__config_next(conf, key, value));
+ return (__config_process_value(conf, value));
+}
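+
+/*
+ * An illustrative iteration using the functions above (error handling
+ * elided, hypothetical configuration string):
+ *
+ *	WT_CONFIG parser;
+ *	WT_CONFIG_ITEM k, v;
+ *
+ *	__wt_config_init(session, &parser,
+ *	    "cache_size=1GB,eviction=(threads_max=4)");
+ *	while (__wt_config_next(&parser, &k, &v) == 0) {
+ *		... first pass: k is "cache_size", v.val is 1GB in bytes;
+ *		... second pass: k is "eviction" and v is a STRUCT item
+ *		... whose string can be handed to __wt_config_subinit.
+ *	}
+ */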
+
+/*
+ * __config_getraw --
+ * Given a config parser, find the final value for a given key.
+ */
+static int
+__config_getraw(
+ WT_CONFIG *cparser, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value, int top)
+{
+ WT_CONFIG sparser;
+ WT_CONFIG_ITEM k, v, subk;
+ WT_DECL_RET;
+ int found;
+
+ found = 0;
+ while ((ret = __config_next(cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ continue;
+ if (k.len == key->len &&
+ strncasecmp(key->str, k.str, k.len) == 0) {
+ *value = v;
+ found = 1;
+ } else if (k.len < key->len && key->str[k.len] == '.' &&
+ strncasecmp(key->str, k.str, k.len) == 0) {
+ subk.str = key->str + k.len + 1;
+ subk.len = (key->len - k.len) - 1;
+ WT_RET(__wt_config_initn(
+ cparser->session, &sparser, v.str, v.len));
+ if ((ret =
+ __config_getraw(&sparser, &subk, value, 0)) == 0)
+ found = 1;
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (!found)
+ return (WT_NOTFOUND);
+ return (top ? __config_process_value(cparser, value) : 0);
+}
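+
+/*
+ * A note on the dotted-key handling above: searching for the key
+ * "eviction.threads_max" in "eviction=(threads_max=4)" first matches the
+ * "eviction" prefix, then re-parses the nested value with a sub-parser to
+ * find "threads_max", returning the item for "4".
+ */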
+
+/*
+ * __wt_config_get --
+ * Given a NULL-terminated list of configuration strings, find
+ * the final value for a given key.
+ */
+int
+__wt_config_get(WT_SESSION_IMPL *session,
+ const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+ WT_DECL_RET;
+ int found;
+
+ for (found = 0; *cfg != NULL; cfg++) {
+ WT_RET(__wt_config_init(session, &cparser, *cfg));
+ if ((ret = __config_getraw(&cparser, key, value, 1)) == 0)
+ found = 1;
+ else if (ret != WT_NOTFOUND)
+ return (ret);
+ }
+
+ return (found ? 0 : WT_NOTFOUND);
+}
+
+/*
+ * __wt_config_gets --
+ * Given a NULL-terminated list of configuration strings, find the final
+ * value for a given string key.
+ */
+int
+__wt_config_gets(WT_SESSION_IMPL *session,
+ const char **cfg, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ return (__wt_config_get(session, cfg, &key_item, value));
+}
+
+/*
+ * __wt_config_getone --
+ * Get the value for a given key from a single config string.
+ */
+int
+__wt_config_getone(WT_SESSION_IMPL *session,
+ const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+
+ WT_RET(__wt_config_init(session, &cparser, config));
+ return (__config_getraw(&cparser, key, value, 1));
+}
+
+/*
+ * __wt_config_getones --
+ * Get the value for a given string key from a single config string.
+ */
+int
+__wt_config_getones(WT_SESSION_IMPL *session,
+ const char *config, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ WT_RET(__wt_config_init(session, &cparser, config));
+ return (__config_getraw(&cparser, &key_item, value, 1));
+}
+
+/*
+ * __wt_config_gets_def --
+ * Performance hack: skip parsing config strings by hard-coding defaults.
+ *
+ * It's expensive to repeatedly parse configuration strings, so don't do
+ * it unless it's necessary in performance paths like cursor creation.
+ * Assume the second configuration string is the application's
+ * configuration string, and if it's not set (which is true most of the
+ * time), then use the supplied default value. This makes it faster to
+ * open cursors when checking for obscure open configuration strings like
+ * "next_random".
+ */
+int
+__wt_config_gets_def(WT_SESSION_IMPL *session,
+ const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value)
+{
+ static const WT_CONFIG_ITEM false_value = {
+ "", 0, 0, WT_CONFIG_ITEM_NUM
+ };
+
+ *value = false_value;
+ value->val = def;
+ if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL)
+ return (0);
+	else if (cfg[2] == NULL) {
+		WT_RET_NOTFOUND_OK(
+		    __wt_config_getones(session, cfg[1], key, value));
+		return (0);
+	}
+ return (__wt_config_gets(session, cfg, key, value));
+}
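+
+/*
+ * An illustrative caller of the function above ("next_random" is the key
+ * named in the comment; the surrounding code is hypothetical):
+ *
+ *	WT_CONFIG_ITEM cval;
+ *
+ *	WT_RET(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ *	if (cval.val != 0)
+ *		... the cursor was opened with next_random=true ...
+ */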
+
+/*
+ * __wt_config_subgetraw --
+ * Get the value for a given key from a config string in a WT_CONFIG_ITEM.
+ * This is useful for dealing with nested structs in config strings.
+ */
+int
+__wt_config_subgetraw(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG cparser;
+
+ WT_RET(__wt_config_initn(session, &cparser, cfg->str, cfg->len));
+ return (__config_getraw(&cparser, key, value, 1));
+}
+
+/*
+ * __wt_config_subgets --
+ * Get the value for a given key from a config string in a WT_CONFIG_ITEM.
+ * This is useful for dealing with nested structs in config strings.
+ */
+int
+__wt_config_subgets(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value)
+{
+ WT_CONFIG_ITEM key_item =
+ { key, strlen(key), 0, WT_CONFIG_ITEM_STRING };
+
+ return (__wt_config_subgetraw(session, cfg, &key_item, value));
+}
diff --git a/src/third_party/wiredtiger/src/config/config_api.c b/src/third_party/wiredtiger/src/config/config_api.c
new file mode 100644
index 00000000000..42f4c117b81
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_api.c
@@ -0,0 +1,105 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __config_parser_close --
+ * WT_CONFIG_PARSER->close method.
+ */
+static int
+__config_parser_close(WT_CONFIG_PARSER *wt_config_parser)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ __wt_free(config_parser->session, config_parser);
+ return (0);
+}
+
+/*
+ * __config_parser_get --
+ * WT_CONFIG_PARSER->search method.
+ */
+static int
+__config_parser_get(WT_CONFIG_PARSER *wt_config_parser,
+ const char *key, WT_CONFIG_ITEM *cval)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ return (__wt_config_subgets(config_parser->session,
+ &config_parser->config_item, key, cval));
+}
+
+/*
+ * __config_parser_next --
+ * WT_CONFIG_PARSER->next method.
+ */
+static int
+__config_parser_next(WT_CONFIG_PARSER *wt_config_parser,
+ WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *cval)
+{
+ WT_CONFIG_PARSER_IMPL *config_parser;
+
+ config_parser = (WT_CONFIG_PARSER_IMPL *)wt_config_parser;
+
+ if (config_parser == NULL)
+ return (EINVAL);
+
+ return (__wt_config_next(&config_parser->config, key, cval));
+}
+
+/*
+ * wiredtiger_config_parser_open --
+ * Create a configuration parser.
+ */
+int
+wiredtiger_config_parser_open(WT_SESSION *wt_session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
+{
+ static const WT_CONFIG_PARSER stds = {
+ __config_parser_close,
+ __config_parser_next,
+ __config_parser_get
+ };
+ WT_CONFIG_ITEM config_item =
+ { config, len, 0, WT_CONFIG_ITEM_STRING };
+ WT_CONFIG_PARSER_IMPL *config_parser;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ *config_parserp = NULL;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ WT_RET(__wt_calloc_def(session, 1, &config_parser));
+ config_parser->iface = stds;
+ config_parser->session = session;
+
+ /*
+	 * Set up a WT_CONFIG_ITEM to be used for get calls and a WT_CONFIG
+ * structure for iterations through the configuration string.
+ */
+ memcpy(&config_parser->config_item, &config_item, sizeof(config_item));
+ WT_ERR(__wt_config_initn(
+ session, &config_parser->config, config, len));
+
+ if (ret == 0)
+ *config_parserp = (WT_CONFIG_PARSER *)config_parser;
+ else
+err: __wt_free(session, config_parser);
+
+ return (ret);
+}
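+
+/*
+ * Illustrative application-level use of the parser created above, through
+ * the public WT_CONFIG_PARSER methods (an open WT_SESSION is assumed,
+ * error handling elided):
+ *
+ *	WT_CONFIG_PARSER *parser;
+ *	WT_CONFIG_ITEM k, v;
+ *
+ *	wiredtiger_config_parser_open(
+ *	    wt_session, config, strlen(config), &parser);
+ *	while (parser->next(parser, &k, &v) == 0)
+ *		... each key/value pair in config ...
+ *	parser->close(parser);
+ */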
diff --git a/src/third_party/wiredtiger/src/config/config_check.c b/src/third_party/wiredtiger/src/config/config_check.c
new file mode 100644
index 00000000000..310e54c3349
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_check.c
@@ -0,0 +1,370 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int config_check(
+ WT_SESSION_IMPL *, const WT_CONFIG_CHECK *, const char *, size_t);
+
+/*
+ * __conn_foc_add --
+ * Add a new entry into the connection's free-on-close list.
+ */
+static int
+__conn_foc_add(WT_SESSION_IMPL *session, const void *p)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * Our caller is expected to be holding any locks we need.
+ */
+ WT_RET(__wt_realloc_def(
+ session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc));
+
+ conn->foc[conn->foc_cnt++] = (void *)p;
+ return (0);
+}
+
+/*
+ * __wt_conn_foc_discard --
+ * Discard any memory the connection accumulated.
+ */
+void
+__wt_conn_foc_discard(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ size_t i;
+
+ conn = S2C(session);
+
+ /*
+ * If we have a list of chunks to free, run through the list, then
+ * free the list itself.
+ */
+ for (i = 0; i < conn->foc_cnt; ++i)
+ __wt_free(session, conn->foc[i]);
+ __wt_free(session, conn->foc);
+}
+
+/*
+ * __wt_configure_method --
+ * WT_CONNECTION.configure_method.
+ */
+int
+__wt_configure_method(WT_SESSION_IMPL *session,
+ const char *method, const char *uri,
+ const char *config, const char *type, const char *check)
+{
+ const WT_CONFIG_CHECK *cp;
+ WT_CONFIG_CHECK *checks, *newcheck;
+ const WT_CONFIG_ENTRY **epp;
+ WT_CONFIG_ENTRY *entry;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ size_t cnt;
+ char *newcheck_name, *p;
+
+ /*
+ * !!!
+ * We ignore the specified uri, that is, all new configuration options
+	 * will be valid for all data sources. That shouldn't be too bad, as
+	 * the worst that can happen is an application might specify some
+ * configuration option and not get an error -- the option should be
+ * ignored by the underlying implementation since it's unexpected, so
+ * there shouldn't be any real problems. Eventually I expect we will
+ * get the whole data-source thing sorted, at which time there may be
+ * configuration arrays for each data source, and that's when the uri
+ * will matter.
+ */
+ WT_UNUSED(uri);
+
+ conn = S2C(session);
+ checks = newcheck = NULL;
+ entry = NULL;
+ newcheck_name = NULL;
+
+ /* Argument checking; we only support a limited number of types. */
+ if (config == NULL)
+ WT_RET_MSG(session, EINVAL, "no configuration specified");
+ if (type == NULL)
+ WT_RET_MSG(session, EINVAL, "no configuration type specified");
+ if (strcmp(type, "boolean") != 0 && strcmp(type, "int") != 0 &&
+ strcmp(type, "list") != 0 && strcmp(type, "string") != 0)
+ WT_RET_MSG(session, EINVAL,
+ "type must be one of \"boolean\", \"int\", \"list\" or "
+ "\"string\"");
+
+ /* Find a match for the method name. */
+ for (epp = conn->config_entries; (*epp)->method != NULL; ++epp)
+ if (strcmp((*epp)->method, method) == 0)
+ break;
+ if ((*epp)->method == NULL)
+ WT_RET_MSG(session,
+ WT_NOTFOUND, "no method matching %s found", method);
+
+ /*
+ * Technically possible for threads to race, lock the connection while
+ * adding the new configuration information. We're holding the lock
+ * for an extended period of time, but configuration changes should be
+ * rare and only happen during startup.
+ */
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /*
+ * Allocate new configuration entry and fill it in.
+ *
+ * The new base value is the previous base value, a separator and the
+ * new configuration string.
+ */
+ WT_ERR(__wt_calloc_def(session, 1, &entry));
+ entry->method = (*epp)->method;
+ WT_ERR(__wt_calloc_def(session,
+ strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p));
+ (void)strcpy(p, (*epp)->base);
+ (void)strcat(p, ",");
+ (void)strcat(p, config);
+ entry->base = p;
+
+ /*
+ * There may be a default value in the config argument passed in (for
+	 * example, "kvs_parallelism=64"). The default value isn't part of the
+	 * name, so build a new one.
+ */
+ WT_ERR(__wt_strdup(session, config, &newcheck_name));
+ if ((p = strchr(newcheck_name, '=')) != NULL)
+ *p = '\0';
+
+ /*
+ * The new configuration name may replace an existing check with new
+ * information, in that case skip the old version.
+ */
+ cnt = 0;
+ if ((*epp)->checks != NULL)
+ for (cp = (*epp)->checks; cp->name != NULL; ++cp)
+ ++cnt;
+ WT_ERR(__wt_calloc_def(session, cnt + 2, &checks));
+ cnt = 0;
+ if ((*epp)->checks != NULL)
+ for (cp = (*epp)->checks; cp->name != NULL; ++cp)
+ if (strcmp(newcheck_name, cp->name) != 0)
+ checks[cnt++] = *cp;
+ newcheck = &checks[cnt];
+ newcheck->name = newcheck_name;
+ WT_ERR(__wt_strdup(session, type, &newcheck->type));
+ if (check != NULL)
+ WT_ERR(__wt_strdup(session, check, &newcheck->checks));
+ entry->checks = checks;
+
+ /*
+ * Confirm the configuration string passes the new set of
+ * checks.
+ */
+ WT_ERR(config_check(session, entry->checks, config, 0));
+
+ /*
+ * The next time this configuration is updated, we don't want to figure
+ * out which of these pieces of memory were allocated and will need to
+ * be free'd on close (this isn't a heavily used API and it's too much
+ * work); add them all to the free-on-close list now. We don't check
+ * for errors deliberately, we'd have to figure out which elements have
+ * already been added to the free-on-close array and which have not in
+ * order to avoid freeing chunks of memory twice. Again, this isn't a
+ * commonly used API and it shouldn't ever happen, just leak it.
+ */
+ (void)__conn_foc_add(session, entry->base);
+ (void)__conn_foc_add(session, entry);
+ (void)__conn_foc_add(session, checks);
+ (void)__conn_foc_add(session, newcheck->type);
+ (void)__conn_foc_add(session, newcheck->checks);
+ (void)__conn_foc_add(session, newcheck_name);
+
+ /*
+ * Instead of using locks to protect configuration information, assume
+ * we can atomically update a pointer to a chunk of memory, and because
+ * a pointer is never partially written, readers will correctly see the
+ * original or new versions of the memory. Readers might be using the
+ * old version as it's being updated, though, which means we cannot free
+ * the old chunk of memory until all possible readers have finished.
+ * Currently, that's on connection close: in other words, we can use
+ * this because it's small amounts of memory, and we really, really do
+ * not want to acquire locks every time we access configuration strings,
+ * since that's done on every API call.
+ */
+ WT_PUBLISH(*epp, entry);
+
+ if (0) {
+err: if (entry != NULL) {
+ __wt_free(session, entry->base);
+ __wt_free(session, entry);
+ }
+ __wt_free(session, checks);
+ if (newcheck != NULL) {
+ __wt_free(session, newcheck->type);
+ __wt_free(session, newcheck->checks);
+ }
+ __wt_free(session, newcheck_name);
+ }
+
+ __wt_spin_unlock(session, &conn->api_lock);
+ return (ret);
+}
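+
+/*
+ * An illustrative application call reaching the function above through the
+ * public WT_CONNECTION.configure_method handle method (hypothetical option
+ * name; the check string uses the "min"/"max" keywords handled by
+ * config_check below):
+ *
+ *	conn->configure_method(conn,
+ *	    "WT_SESSION.open_cursor", NULL, "my_option=5", "int",
+ *	    "min=1,max=32");
+ */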
+
+/*
+ * __wt_config_check --
+ * Check the keys in an application-supplied config string match what is
+ * specified in an array of check strings.
+ */
+int
+__wt_config_check(WT_SESSION_IMPL *session,
+ const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len)
+{
+ /*
+	 * Callers don't check for a NULL configuration or check array: make
+	 * this a fast call in those cases.
+ */
+ return (config == NULL || entry->checks == NULL ?
+ 0 : config_check(session, entry->checks, config, config_len));
+}
+
+/*
+ * config_check --
+ * Check the keys in an application-supplied config string match what is
+ * specified in an array of check strings.
+ */
+static int
+config_check(WT_SESSION_IMPL *session,
+ const WT_CONFIG_CHECK *checks, const char *config, size_t config_len)
+{
+ WT_CONFIG parser, cparser, sparser;
+ WT_CONFIG_ITEM k, v, ck, cv, dummy;
+ WT_DECL_RET;
+ int badtype, found, i;
+
+ /*
+ * The config_len parameter is optional, and allows passing in strings
+ * that are not nul-terminated.
+ */
+ if (config_len == 0)
+ WT_RET(__wt_config_init(session, &parser, config));
+ else
+ WT_RET(__wt_config_initn(session, &parser, config, config_len));
+ while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ WT_RET_MSG(session, EINVAL,
+ "Invalid configuration key found: '%.*s'",
+ (int)k.len, k.str);
+
+ /* Search for a matching entry. */
+ for (i = 0; checks[i].name != NULL; i++)
+ if (WT_STRING_MATCH(checks[i].name, k.str, k.len))
+ break;
+ if (checks[i].name == NULL)
+ WT_RET_MSG(session, EINVAL,
+ "unknown configuration key: '%.*s'",
+ (int)k.len, k.str);
+
+ if (strcmp(checks[i].type, "boolean") == 0) {
+ badtype = (v.type != WT_CONFIG_ITEM_BOOL &&
+ (v.type != WT_CONFIG_ITEM_NUM ||
+ (v.val != 0 && v.val != 1)));
+ } else if (strcmp(checks[i].type, "category") == 0) {
+ /* Deal with categories of the form: XXX=(XXX=blah). */
+ ret = config_check(session,
+ checks[i].subconfigs,
+ k.str + strlen(checks[i].name) + 1, v.len);
+ if (ret != EINVAL)
+ badtype = 0;
+ else
+ badtype = 1;
+ } else if (strcmp(checks[i].type, "format") == 0) {
+ badtype = 0;
+ } else if (strcmp(checks[i].type, "int") == 0) {
+ badtype = (v.type != WT_CONFIG_ITEM_NUM);
+ } else if (strcmp(checks[i].type, "list") == 0) {
+ badtype = (v.len > 0 &&
+ v.type != WT_CONFIG_ITEM_STRUCT);
+ } else if (strcmp(checks[i].type, "string") == 0) {
+ badtype = 0;
+ } else
+ WT_RET_MSG(session, EINVAL,
+ "unknown configuration type: '%s'",
+ checks[i].type);
+
+ if (badtype)
+ WT_RET_MSG(session, EINVAL,
+ "Invalid value for key '%.*s': expected a %s",
+ (int)k.len, k.str, checks[i].type);
+
+ if (checks[i].checks == NULL)
+ continue;
+
+		/* Set up an iterator for the check string. */
+ WT_RET(__wt_config_init(session, &cparser, checks[i].checks));
+ while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+ if (WT_STRING_MATCH("min", ck.str, ck.len)) {
+ if (v.val < cv.val)
+ WT_RET_MSG(session, EINVAL,
+ "Value too small for key '%.*s' "
+ "the minimum is %.*s",
+ (int)k.len, k.str,
+ (int)cv.len, cv.str);
+ } else if (WT_STRING_MATCH("max", ck.str, ck.len)) {
+ if (v.val > cv.val)
+ WT_RET_MSG(session, EINVAL,
+ "Value too large for key '%.*s' "
+ "the maximum is %.*s",
+ (int)k.len, k.str,
+ (int)cv.len, cv.str);
+ } else if (WT_STRING_MATCH("choices", ck.str, ck.len)) {
+ if (v.len == 0)
+ WT_RET_MSG(session, EINVAL,
+ "Key '%.*s' requires a value",
+ (int)k.len, k.str);
+ if (v.type == WT_CONFIG_ITEM_STRUCT) {
+ /*
+ * Handle the 'verbose' case of a list
+ * containing restricted choices.
+ */
+ WT_RET(__wt_config_subinit(session,
+ &sparser, &v));
+ found = 1;
+ while (found &&
+ (ret = __wt_config_next(&sparser,
+ &v, &dummy)) == 0) {
+ ret = __wt_config_subgetraw(
+ session, &cv, &v, &dummy);
+ found = (ret == 0);
+ }
+ } else {
+ ret = __wt_config_subgetraw(session,
+ &cv, &v, &dummy);
+ found = (ret == 0);
+ }
+
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ if (!found)
+ WT_RET_MSG(session, EINVAL,
+ "Value '%.*s' not a "
+ "permitted choice for key '%.*s'",
+ (int)v.len, v.str,
+ (int)k.len, k.str);
+ } else
+ WT_RET_MSG(session, EINVAL,
+ "unexpected configuration description "
+ "keyword %.*s", (int)ck.len, ck.str);
+ }
+ }
+
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_collapse.c b/src/third_party/wiredtiger/src/config/config_collapse.c
new file mode 100644
index 00000000000..3e4c539cbe9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_collapse.c
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_collapse --
+ * Collapse a set of configuration strings into newly allocated memory.
+ *
+ * This function takes a NULL-terminated list of configuration strings (where
+ * the first one contains all the defaults and the values are in order from
+ * least to most preferred, that is, the default values are least preferred),
+ * and collapses them into newly allocated memory. The algorithm is to walk
+ * the first of the configuration strings, and for each entry, search all of
+ * the configuration strings for a final value, keeping the last value found.
+ *
+ * Notes:
+ * Any key not appearing in the first configuration string is discarded
+ * from the final result, because we'll never search for it.
+ *
+ * Nested structures aren't parsed. For example, imagine a configuration
+ * string contains "key=(k2=v2,k3=v3)", and a subsequent string has
+ * "key=(k4=v4)", the result will be "key=(k4=v4)", as we search for and
+ * use the final value of "key", regardless of field overlap or missing
+ * fields in the nested value.
+ */
+int
+__wt_config_collapse(
+ WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+
+ WT_ERR(__wt_config_init(session, &cparser, cfg[0]));
+ while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+			WT_ERR_MSG(session, EINVAL,
+			    "Invalid configuration key found: '%.*s'",
+			    (int)k.len, k.str);
+ WT_ERR(__wt_config_get(session, cfg, &k, &v));
+ /* Include the quotes around string keys/values. */
+ if (k.type == WT_CONFIG_ITEM_STRING) {
+ --k.str;
+ k.len += 2;
+ }
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+ WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s=%.*s,",
+ (int)k.len, k.str, (int)v.len, v.str));
+ }
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * If the caller passes us no valid configuration strings, we get here
+ * with no bytes to copy -- that's OK, the underlying string copy can
+ * handle empty strings.
+ *
+ * Strip any trailing comma.
+ */
+ if (tmp->size != 0)
+ --tmp->size;
+ ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
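+
+/*
+ * An example of the collapse semantics described above (illustrative
+ * inputs):
+ *
+ *	cfg[0] = "cache=10,style=plain"		the defaults
+ *	cfg[1] = "style=fancy"
+ *	cfg[2] = "cache=50,unknown=1"
+ *
+ * collapses to "cache=50,style=fancy"; the "unknown" key is dropped
+ * because it doesn't appear in the first (default) string.
+ */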
+
+/*
+ * We need a character that can't appear in a key as a separator.
+ */
+#undef SEP /* separator key, character */
+#define SEP "["
+#undef SEPC
+#define SEPC '['
+
+/*
+ * Individual configuration entries, including a generation number used to make
+ * the qsort stable.
+ */
+typedef struct {
+ char *k, *v; /* key, value */
+ size_t gen; /* generation */
+} WT_CONFIG_MERGE_ENTRY;
+
+/*
+ * The array of configuration entries.
+ */
+typedef struct {
+ size_t entries_allocated; /* allocated */
+ size_t entries_next; /* next slot */
+
+ WT_CONFIG_MERGE_ENTRY *entries; /* array of entries */
+} WT_CONFIG_MERGE;
+
+/*
+ * __config_merge_scan --
+ * Walk a configuration string, inserting entries into the merged array.
+ */
+static int
+__config_merge_scan(WT_SESSION_IMPL *session,
+ const char *key, const char *value, WT_CONFIG_MERGE *cp)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(kb);
+ WT_DECL_ITEM(vb);
+ WT_DECL_RET;
+ size_t len;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &kb));
+ WT_ERR(__wt_scr_alloc(session, 0, &vb));
+
+ WT_ERR(__wt_config_init(session, &cparser, value));
+ while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+			WT_ERR_MSG(session, EINVAL,
+			    "Invalid configuration key found: '%.*s'",
+			    (int)k.len, k.str);
+
+ /* Include the quotes around string keys/values. */
+ if (k.type == WT_CONFIG_ITEM_STRING) {
+ --k.str;
+ k.len += 2;
+ }
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+
+ /*
+ * !!!
+ * We're using a JSON quote character to separate the names we
+ * create for nested structures. That's not completely safe as
+ * it's possible to quote characters in JSON such that a quote
+ * character appears as a literal character in a key name. In
+ * a few cases, applications can create their own key namespace
+ * (for example, shared library extension names), and therefore
+ * it's possible for an application to confuse us. Error if we
+	 * ever see a key with a magic character.
+ */
+ for (len = 0; len < k.len; ++len)
+ if (k.str[len] == SEPC)
+ WT_ERR_MSG(session, EINVAL,
+ "key %.*s contains a '%c' separator "
+ "character",
+ (int)k.len, (char *)k.str, SEPC);
+
+ /* Build the key/value strings. */
+ WT_ERR(__wt_buf_fmt(session,
+ kb, "%s%s%.*s",
+ key == NULL ? "" : key,
+ key == NULL ? "" : SEP,
+ (int)k.len, k.str));
+ WT_ERR(__wt_buf_fmt(session,
+ vb, "%.*s", (int)v.len, v.str));
+
+ /*
+ * If the value is a structure, recursively parse it.
+ *
+ * !!!
+ * Don't merge unless the structure has field names. WiredTiger
+ * stores checkpoint LSNs in the metadata file using nested
+ * structures without field names: "checkpoint_lsn=(1,0)", not
+ * "checkpoint_lsn=(file=1,offset=0)". The value type is still
+ * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the
+ * value.
+ */
+ if (v.type == WT_CONFIG_ITEM_STRUCT &&
+ strchr(vb->data, '=') != NULL) {
+ WT_ERR(__config_merge_scan(
+ session, kb->data, vb->data, cp));
+ continue;
+ }
+
+ /* Insert the value into the array. */
+ WT_ERR(__wt_realloc_def(session,
+ &cp->entries_allocated,
+ cp->entries_next + 1, &cp->entries));
+ WT_ERR(__wt_strndup(session,
+ kb->data, kb->size, &cp->entries[cp->entries_next].k));
+ WT_ERR(__wt_strndup(session,
+ vb->data, vb->size, &cp->entries[cp->entries_next].v));
+ cp->entries[cp->entries_next].gen = cp->entries_next;
+ ++cp->entries_next;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(&kb);
+ __wt_scr_free(&vb);
+ return (ret);
+}
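+
+/*
+ * Editor's note, an illustrative trace, not in the original source:
+ * scanning the string "key=(k1=v1,k2=v2)" flattens the nested structure
+ * into two entries whose names are joined with the SEP character:
+ *
+ *	"key[k1" -> "v1"
+ *	"key[k2" -> "v2"
+ *
+ * The generation numbers record order of appearance, keeping the later
+ * qsort stable for identical keys.
+ */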
+
+/*
+ * __strip_comma --
+ * Strip a trailing comma.
+ */
+static void
+__strip_comma(WT_ITEM *buf)
+{
+ if (buf->size != 0 && ((char *)buf->data)[buf->size - 1] == ',')
+ --buf->size;
+}
+
+/*
+ * __config_merge_format_next --
+ * Walk the array, building entries.
+ */
+static int
+__config_merge_format_next(WT_SESSION_IMPL *session, const char *prefix,
+ size_t plen, size_t *enp, WT_CONFIG_MERGE *cp, WT_ITEM *build)
+{
+ WT_CONFIG_MERGE_ENTRY *ep;
+ size_t len1, len2, next;
+ char *p;
+
+ for (; *enp < cp->entries_next; ++*enp) {
+ ep = &cp->entries[*enp];
+ len1 = strlen(ep->k);
+
+ /*
+ * The entries are in sorted order; take the last entry for any
+ * key.
+ */
+ if (*enp < (cp->entries_next - 1)) {
+ len2 = strlen((ep + 1)->k);
+
+ /* Choose the last of identical keys. */
+ if (len1 == len2 &&
+ memcmp(ep->k, (ep + 1)->k, len1) == 0)
+ continue;
+
+ /*
+ * The test is complicated by matching empty entries
+ * "foo=" against nested structures "foo,bar=", where
+ * the latter is a replacement for the former.
+ */
+ if (len2 > len1 &&
+ (ep + 1)->k[len1] == SEPC &&
+ memcmp(ep->k, (ep + 1)->k, len1) == 0)
+ continue;
+ }
+
+ /*
+ * If we're skipping a prefix and this entry doesn't match it,
+ * back off one entry and pop up a level.
+ */
+ if (plen != 0 &&
+ (plen > len1 || memcmp(ep->k, prefix, plen) != 0)) {
+ --*enp;
+ break;
+ }
+
+ /*
+ * If the entry introduces a new level, recurse through that
+ * new level.
+ */
+ if ((p = strchr(ep->k + plen, SEPC)) != NULL) {
+ next = WT_PTRDIFF(p, ep->k);
+ WT_RET(__wt_buf_catfmt(session,
+ build, "%.*s=(", (int)(next - plen), ep->k + plen));
+ WT_RET(__config_merge_format_next(
+ session, ep->k, next + 1, enp, cp, build));
+ __strip_comma(build);
+ WT_RET(__wt_buf_catfmt(session, build, "),"));
+ continue;
+ }
+
+ /* Append the entry to the buffer. */
+ WT_RET(__wt_buf_catfmt(
+ session, build, "%s=%s,", ep->k + plen, ep->v));
+ }
+
+ return (0);
+}
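+
+/*
+ * Editor's note, an illustrative trace, not in the original source:
+ * given the sorted entries "key[k1" -> "v2" and "key[k2" -> "v2", the
+ * first iteration finds the '[' separator, appends "key=(", then
+ * recurses with prefix "key[" to append "k1=v2,k2=v2,"; the recursion
+ * returns, the trailing comma is stripped and the level is closed,
+ * leaving "key=(k1=v2,k2=v2)," in the build buffer.
+ */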
+
+/*
+ * __config_merge_format --
+ * Take the sorted array of entries, and format them into allocated memory.
+ */
+static int
+__config_merge_format(
+ WT_SESSION_IMPL *session, WT_CONFIG_MERGE *cp, const char **config_ret)
+{
+ WT_DECL_ITEM(build);
+ WT_DECL_RET;
+ size_t entries;
+
+ WT_RET(__wt_scr_alloc(session, 4 * 1024, &build));
+
+ entries = 0;
+ WT_ERR(__config_merge_format_next(session, "", 0, &entries, cp, build));
+
+ __strip_comma(build);
+
+ ret = __wt_strndup(session, build->data, build->size, config_ret);
+
+err: __wt_scr_free(&build);
+ return (ret);
+}
+
+/*
+ * __config_merge_cmp --
+ * Qsort function: sort the config merge array.
+ */
+static int
+__config_merge_cmp(const void *a, const void *b)
+{
+ WT_CONFIG_MERGE_ENTRY *ae, *be;
+ int cmp;
+
+ ae = (WT_CONFIG_MERGE_ENTRY *)a;
+ be = (WT_CONFIG_MERGE_ENTRY *)b;
+
+ if ((cmp = strcmp(ae->k, be->k)) != 0)
+ return (cmp);
+ return (ae->gen > be->gen ? 1 : -1);
+}
+
+/*
+ * __wt_config_merge --
+ * Merge a set of configuration strings into newly allocated memory.
+ *
+ * This function takes a NULL-terminated list of configuration strings (where
+ * the values are in order from least to most preferred), and merges them into
+ * newly allocated memory. The algorithm is to walk the configuration strings
+ * and build a table of each key/value pair. The pairs are sorted based on the
+ * name and the configuration string in which they were found, and a final
+ * configuration string is built from the result.
+ *
+ * Note:
+ * Nested structures are parsed and merged. For example, if configuration
+ * strings "key=(k1=v1,k2=v2)" and "key=(k1=v2)" appear, the result will
+ * be "key=(k1=v2,k2=v2)" because the nested values are merged.
+ */
+int
+__wt_config_merge(
+ WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+ WT_CONFIG_MERGE merge;
+ WT_DECL_RET;
+ size_t i;
+
+ /* Start out with a reasonable number of entries. */
+ WT_CLEAR(merge);
+
+ WT_RET(__wt_realloc_def(
+ session, &merge.entries_allocated, 100, &merge.entries));
+
+ /* Scan the configuration strings, entering them into the array. */
+ for (; *cfg != NULL; ++cfg)
+ WT_ERR(__config_merge_scan(session, NULL, *cfg, &merge));
+
+ /*
+ * Sort the array by key and, in the case of identical keys, by
+ * generation.
+ */
+ qsort(merge.entries, merge.entries_next,
+ sizeof(WT_CONFIG_MERGE_ENTRY), __config_merge_cmp);
+
+ /* Convert the array of entries into a string. */
+ ret = __config_merge_format(session, &merge, config_ret);
+
+err: for (i = 0; i < merge.entries_next; ++i) {
+ __wt_free(session, merge.entries[i].k);
+ __wt_free(session, merge.entries[i].v);
+ }
+ __wt_free(session, merge.entries);
+ return (ret);
+}
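+
+/*
+ * Editor's illustrative sketch, not part of the original source,
+ * assuming an open WT_SESSION_IMPL *session:
+ *
+ *	const char *cfg[] =
+ *	    { "key=(k1=v1,k2=v2)", "key=(k1=v2)", NULL };
+ *	const char *result;
+ *
+ *	WT_RET(__wt_config_merge(session, cfg, &result));
+ *
+ * On success, result is "key=(k1=v2,k2=v2)" in newly allocated memory
+ * the caller releases with __wt_free(session, result).
+ */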
diff --git a/src/third_party/wiredtiger/src/config/config_concat.c b/src/third_party/wiredtiger/src/config/config_concat.c
new file mode 100644
index 00000000000..99475ef6f47
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_concat.c
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_concat --
+ * Given a NULL-terminated list of configuration strings, concatenate them
+ * into newly allocated memory. Nothing special is assumed about any of
+ * the config strings; they are simply combined in order.
+ *
+ * This code deals with the case where some of the config strings are
+ * wrapped in brackets but others aren't: the resulting string does not
+ * have brackets.
+ */
+int
+__wt_config_concat(
+ WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ const char **cp;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+
+ for (cp = cfg; *cp != NULL; ++cp) {
+ WT_ERR(__wt_config_init(session, &cparser, *cp));
+ while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
+ if (k.type != WT_CONFIG_ITEM_STRING &&
+ k.type != WT_CONFIG_ITEM_ID)
+ WT_ERR_MSG(session, EINVAL,
+ "Invalid configuration key found: '%s'\n",
+ k.str);
+ /* Include the quotes around string keys/values. */
+ if (k.type == WT_CONFIG_ITEM_STRING) {
+ --k.str;
+ k.len += 2;
+ }
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+ WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s%s%.*s,",
+ (int)k.len, k.str,
+ (v.len > 0) ? "=" : "",
+ (int)v.len, v.str));
+ }
+ if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ /*
+ * If the caller passes us no valid configuration strings, we get here
+ * with no bytes to copy -- that's OK, the underlying string copy can
+ * handle empty strings.
+ *
+ * Strip any trailing comma.
+ */
+ if (tmp->size != 0)
+ --tmp->size;
+ ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
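+
+/*
+ * Editor's illustrative sketch, not part of the original source,
+ * assuming an open WT_SESSION_IMPL *session:
+ *
+ *	const char *cfg[] = { "a=1,b=2", "b=3", NULL };
+ *	const char *result;
+ *
+ *	WT_RET(__wt_config_concat(session, cfg, &result));
+ *
+ * On success, result is "a=1,b=2,b=3"; later lookups of "b" in the
+ * concatenated string see 3, because configuration parsing prefers the
+ * final occurrence of a key.
+ */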
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
new file mode 100644
index 00000000000..0cd2d32df57
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -0,0 +1,744 @@
+/* DO NOT EDIT: automatically built by dist/config.py. */
+
+#include "wt_internal.h"
+
+static const WT_CONFIG_CHECK confchk_colgroup_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "source", "string", NULL, NULL },
+ { "type", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_async_new_op[] = {
+ { "append", "boolean", NULL, NULL },
+ { "overwrite", "boolean", NULL, NULL },
+ { "raw", "boolean", NULL, NULL },
+ { "timeout", "int", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_close[] = {
+ { "leak_memory", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_load_extension[] = {
+ { "config", "string", NULL, NULL },
+ { "entry", "string", NULL, NULL },
+ { "terminate", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_open_session[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_async_subconfigs[] = {
+ { "enabled", "boolean", NULL, NULL },
+ { "ops_max", "int", "min=10,max=4096", NULL },
+ { "threads", "int", "min=1,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_checkpoint_subconfigs[] = {
+ { "log_size", "int", "min=0,max=2GB", NULL },
+ { "name", "string", NULL, NULL },
+ { "wait", "int", "min=0,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_eviction_subconfigs[] = {
+ { "threads_max", "int", "min=1,max=20", NULL },
+ { "threads_min", "int", "min=1,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_lsm_manager_subconfigs[] = {
+ { "merge", "boolean", NULL, NULL },
+ { "worker_thread_max", "int", "min=3,max=20", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_shared_cache_subconfigs[] = {
+ { "chunk", "int", "min=1MB,max=10TB", NULL },
+ { "name", "string", NULL, NULL },
+ { "reserve", "int", NULL, NULL },
+ { "size", "int", "min=1MB,max=10TB", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_statistics_log_subconfigs[] = {
+ { "on_close", "boolean", NULL, NULL },
+ { "path", "string", NULL, NULL },
+ { "sources", "list", NULL, NULL },
+ { "timestamp", "string", NULL, NULL },
+ { "wait", "int", "min=0,max=100000", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_connection_reconfigure[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_file_meta[] = {
+ { "allocation_size", "int", "min=512B,max=128MB", NULL },
+ { "app_metadata", "string", NULL, NULL },
+ { "block_allocation", "string",
+ "choices=[\"first\",\"best\"]",
+ NULL },
+ { "block_compressor", "string", NULL, NULL },
+ { "cache_resident", "boolean", NULL, NULL },
+ { "checkpoint", "string", NULL, NULL },
+ { "checkpoint_lsn", "string", NULL, NULL },
+ { "checksum", "string",
+ "choices=[\"on\",\"off\",\"uncompressed\"]",
+ NULL },
+ { "collator", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "dictionary", "int", "min=0", NULL },
+ { "format", "string", "choices=[\"btree\"]", NULL },
+ { "huffman_key", "string", NULL, NULL },
+ { "huffman_value", "string", NULL, NULL },
+ { "id", "string", NULL, NULL },
+ { "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_truncate", "boolean", NULL, NULL },
+ { "internal_page_max", "int", "min=512B,max=512MB", NULL },
+ { "key_format", "format", NULL, NULL },
+ { "key_gap", "int", "min=0", NULL },
+ { "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "memory_page_max", "int", "min=512B,max=10TB", NULL },
+ { "os_cache_dirty_max", "int", "min=0", NULL },
+ { "os_cache_max", "int", "min=0", NULL },
+ { "prefix_compression", "boolean", NULL, NULL },
+ { "prefix_compression_min", "int", "min=0", NULL },
+ { "split_pct", "int", "min=25,max=100", NULL },
+ { "value_format", "format", NULL, NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_index_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "key_format", "format", NULL, NULL },
+ { "source", "string", NULL, NULL },
+ { "type", "string", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_begin_transaction[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { "name", "string", NULL, NULL },
+ { "priority", "int", "min=-100,max=100", NULL },
+ { "sync", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_checkpoint[] = {
+ { "drop", "list", NULL, NULL },
+ { "force", "boolean", NULL, NULL },
+ { "name", "string", NULL, NULL },
+ { "target", "list", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_compact[] = {
+ { "timeout", "int", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_lsm_subconfigs[] = {
+ { "auto_throttle", "boolean", NULL, NULL },
+ { "bloom", "boolean", NULL, NULL },
+ { "bloom_bit_count", "int", "min=2,max=1000", NULL },
+ { "bloom_config", "string", NULL, NULL },
+ { "bloom_hash_count", "int", "min=2,max=100", NULL },
+ { "bloom_oldest", "boolean", NULL, NULL },
+ { "chunk_max", "int", "min=100MB,max=10TB", NULL },
+ { "chunk_size", "int", "min=512K,max=500MB", NULL },
+ { "merge_max", "int", "min=2,max=100", NULL },
+ { "merge_min", "int", "max=100", NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_create[] = {
+ { "allocation_size", "int", "min=512B,max=128MB", NULL },
+ { "app_metadata", "string", NULL, NULL },
+ { "block_allocation", "string",
+ "choices=[\"first\",\"best\"]",
+ NULL },
+ { "block_compressor", "string", NULL, NULL },
+ { "cache_resident", "boolean", NULL, NULL },
+ { "checksum", "string",
+ "choices=[\"on\",\"off\",\"uncompressed\"]",
+ NULL },
+ { "colgroups", "list", NULL, NULL },
+ { "collator", "string", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "dictionary", "int", "min=0", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "format", "string", "choices=[\"btree\"]", NULL },
+ { "huffman_key", "string", NULL, NULL },
+ { "huffman_value", "string", NULL, NULL },
+ { "internal_item_max", "int", "min=0", NULL },
+ { "internal_key_truncate", "boolean", NULL, NULL },
+ { "internal_page_max", "int", "min=512B,max=512MB", NULL },
+ { "key_format", "format", NULL, NULL },
+ { "key_gap", "int", "min=0", NULL },
+ { "leaf_item_max", "int", "min=0", NULL },
+ { "leaf_page_max", "int", "min=512B,max=512MB", NULL },
+ { "lsm", "category", NULL, confchk_lsm_subconfigs },
+ { "memory_page_max", "int", "min=512B,max=10TB", NULL },
+ { "os_cache_dirty_max", "int", "min=0", NULL },
+ { "os_cache_max", "int", "min=0", NULL },
+ { "prefix_compression", "boolean", NULL, NULL },
+ { "prefix_compression_min", "int", "min=0", NULL },
+ { "source", "string", NULL, NULL },
+ { "split_pct", "int", "min=25,max=100", NULL },
+ { "type", "string", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_drop[] = {
+ { "force", "boolean", NULL, NULL },
+ { "remove_files", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_open_cursor[] = {
+ { "append", "boolean", NULL, NULL },
+ { "bulk", "string", NULL, NULL },
+ { "checkpoint", "string", NULL, NULL },
+ { "dump", "string",
+ "choices=[\"hex\",\"json\",\"print\"]",
+ NULL },
+ { "next_random", "boolean", NULL, NULL },
+ { "overwrite", "boolean", NULL, NULL },
+ { "raw", "boolean", NULL, NULL },
+ { "readonly", "boolean", NULL, NULL },
+ { "skip_sort_check", "boolean", NULL, NULL },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"clear\"]",
+ NULL },
+ { "target", "list", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_reconfigure[] = {
+ { "isolation", "string",
+ "choices=[\"read-uncommitted\",\"read-committed\",\"snapshot\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_salvage[] = {
+ { "force", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_session_verify[] = {
+ { "dump_address", "boolean", NULL, NULL },
+ { "dump_blocks", "boolean", NULL, NULL },
+ { "dump_offsets", "list", NULL, NULL },
+ { "dump_pages", "boolean", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_table_meta[] = {
+ { "app_metadata", "string", NULL, NULL },
+ { "colgroups", "list", NULL, NULL },
+ { "columns", "list", NULL, NULL },
+ { "key_format", "format", NULL, NULL },
+ { "value_format", "format", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_log_subconfigs[] = {
+ { "archive", "boolean", NULL, NULL },
+ { "enabled", "boolean", NULL, NULL },
+ { "file_max", "int", "min=100KB,max=2GB", NULL },
+ { "path", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_transaction_sync_subconfigs[] = {
+ { "enabled", "boolean", NULL, NULL },
+ { "method", "string",
+ "choices=[\"dsync\",\"fsync\",\"none\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "config_base", "boolean", NULL, NULL },
+ { "create", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "use_environment_priv", "boolean", NULL, NULL },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "config_base", "boolean", NULL, NULL },
+ { "create", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "exclusive", "boolean", NULL, NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "use_environment_priv", "boolean", NULL, NULL },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { "version", "string", NULL, NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
+ { "async", "category", NULL, confchk_async_subconfigs },
+ { "buffer_alignment", "int", "min=-1,max=1MB", NULL },
+ { "cache_size", "int", "min=1MB,max=10TB", NULL },
+ { "checkpoint", "category", NULL,
+ confchk_checkpoint_subconfigs },
+ { "checkpoint_sync", "boolean", NULL, NULL },
+ { "direct_io", "list",
+ "choices=[\"checkpoint\",\"data\",\"log\"]",
+ NULL },
+ { "error_prefix", "string", NULL, NULL },
+ { "eviction", "category", NULL, confchk_eviction_subconfigs },
+ { "eviction_dirty_target", "int", "min=10,max=99", NULL },
+ { "eviction_target", "int", "min=10,max=99", NULL },
+ { "eviction_trigger", "int", "min=10,max=99", NULL },
+ { "extensions", "list", NULL, NULL },
+ { "file_extend", "list", "choices=[\"data\",\"log\"]", NULL },
+ { "hazard_max", "int", "min=15", NULL },
+ { "log", "category", NULL, confchk_log_subconfigs },
+ { "lsm_manager", "category", NULL,
+ confchk_lsm_manager_subconfigs },
+ { "lsm_merge", "boolean", NULL, NULL },
+ { "mmap", "boolean", NULL, NULL },
+ { "multiprocess", "boolean", NULL, NULL },
+ { "session_max", "int", "min=1", NULL },
+ { "shared_cache", "category", NULL,
+ confchk_shared_cache_subconfigs },
+ { "statistics", "list",
+ "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
+ NULL },
+ { "statistics_log", "category", NULL,
+ confchk_statistics_log_subconfigs },
+ { "transaction_sync", "category", NULL,
+ confchk_transaction_sync_subconfigs },
+ { "verbose", "list",
+ "choices=[\"api\",\"block\",\"checkpoint\",\"compact\",\"evict\""
+ ",\"evictserver\",\"fileops\",\"log\",\"lsm\",\"metadata\","
+ "\"mutex\",\"overflow\",\"read\",\"reconcile\",\"recovery\","
+ "\"salvage\",\"shared_cache\",\"split\",\"temporary\","
+ "\"transaction\",\"verify\",\"version\",\"write\"]",
+ NULL },
+ { NULL, NULL, NULL, NULL }
+};
+
+static const WT_CONFIG_ENTRY config_entries[] = {
+ { "colgroup.meta",
+ "app_metadata=,columns=,source=,type=file",
+ confchk_colgroup_meta
+ },
+ { "connection.add_collator",
+ "",
+ NULL
+ },
+ { "connection.add_compressor",
+ "",
+ NULL
+ },
+ { "connection.add_data_source",
+ "",
+ NULL
+ },
+ { "connection.add_extractor",
+ "",
+ NULL
+ },
+ { "connection.async_new_op",
+ "append=0,overwrite=,raw=0,timeout=1200",
+ confchk_connection_async_new_op
+ },
+ { "connection.close",
+ "leak_memory=0",
+ confchk_connection_close
+ },
+ { "connection.load_extension",
+ "config=,entry=wiredtiger_extension_init,"
+ "terminate=wiredtiger_extension_terminate",
+ confchk_connection_load_extension
+ },
+ { "connection.open_session",
+ "isolation=read-committed",
+ confchk_connection_open_session
+ },
+ { "connection.reconfigure",
+ "async=(enabled=0,ops_max=1024,threads=2),cache_size=100MB,"
+ "checkpoint=(log_size=0,name=\"WiredTigerCheckpoint\",wait=0),"
+ "error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=,"
+ "shared_cache=(chunk=10MB,name=,reserve=0,size=500MB),"
+ "statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
+ confchk_connection_reconfigure
+ },
+ { "cursor.close",
+ "",
+ NULL
+ },
+ { "file.meta",
+ "allocation_size=4KB,app_metadata=,block_allocation=best,"
+ "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=,"
+ "checksum=uncompressed,collator=,columns=,dictionary=0,"
+ "format=btree,huffman_key=,huffman_value=,id=,internal_item_max=0"
+ ",internal_key_truncate=,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
+ "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0,"
+ "prefix_compression=0,prefix_compression_min=4,split_pct=75,"
+ "value_format=u,version=(major=0,minor=0)",
+ confchk_file_meta
+ },
+ { "index.meta",
+ "app_metadata=,columns=,key_format=u,source=,type=file,"
+ "value_format=u",
+ confchk_index_meta
+ },
+ { "session.begin_transaction",
+ "isolation=,name=,priority=0,sync=",
+ confchk_session_begin_transaction
+ },
+ { "session.checkpoint",
+ "drop=,force=0,name=,target=",
+ confchk_session_checkpoint
+ },
+ { "session.close",
+ "",
+ NULL
+ },
+ { "session.commit_transaction",
+ "",
+ NULL
+ },
+ { "session.compact",
+ "timeout=1200",
+ confchk_session_compact
+ },
+ { "session.create",
+ "allocation_size=4KB,app_metadata=,block_allocation=best,"
+ "block_compressor=,cache_resident=0,checksum=uncompressed,"
+ "colgroups=,collator=,columns=,dictionary=0,exclusive=0,"
+ "format=btree,huffman_key=,huffman_value=,internal_item_max=0,"
+ "internal_key_truncate=,internal_page_max=4KB,key_format=u,"
+ "key_gap=10,leaf_item_max=0,leaf_page_max=32KB,"
+ "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=,"
+ "bloom_hash_count=8,bloom_oldest=0,chunk_max=5GB,chunk_size=10MB,"
+ "merge_max=15,merge_min=0),memory_page_max=5MB,"
+ "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,"
+ "prefix_compression_min=4,source=,split_pct=75,type=file,"
+ "value_format=u",
+ confchk_session_create
+ },
+ { "session.drop",
+ "force=0,remove_files=",
+ confchk_session_drop
+ },
+ { "session.log_printf",
+ "",
+ NULL
+ },
+ { "session.open_cursor",
+ "append=0,bulk=0,checkpoint=,dump=,next_random=0,overwrite=,raw=0"
+ ",readonly=0,skip_sort_check=0,statistics=,target=",
+ confchk_session_open_cursor
+ },
+ { "session.reconfigure",
+ "isolation=read-committed",
+ confchk_session_reconfigure
+ },
+ { "session.rename",
+ "",
+ NULL
+ },
+ { "session.rollback_transaction",
+ "",
+ NULL
+ },
+ { "session.salvage",
+ "force=0",
+ confchk_session_salvage
+ },
+ { "session.truncate",
+ "",
+ NULL
+ },
+ { "session.upgrade",
+ "",
+ NULL
+ },
+ { "session.verify",
+ "dump_address=0,dump_blocks=0,dump_offsets=,dump_pages=0",
+ confchk_session_verify
+ },
+ { "table.meta",
+ "app_metadata=,colgroups=,columns=,key_format=u,value_format=u",
+ confchk_table_meta
+ },
+ { "wiredtiger_open",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "config_base=,create=0,direct_io=,error_prefix=,"
+ "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80,"
+ "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
+ "file_extend=,hazard_max=1000,log=(archive=,enabled=0,"
+ "file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),use_environment_priv=0,verbose=",
+ confchk_wiredtiger_open
+ },
+ { "wiredtiger_open_all",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "config_base=,create=0,direct_io=,error_prefix=,"
+ "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80,"
+ "eviction_target=80,eviction_trigger=95,exclusive=0,extensions=,"
+ "file_extend=,hazard_max=1000,log=(archive=,enabled=0,"
+ "file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),use_environment_priv=0,verbose=,version=(major=0,"
+ "minor=0)",
+ confchk_wiredtiger_open_all
+ },
+ { "wiredtiger_open_basecfg",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0"
+ ",file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),verbose=,version=(major=0,minor=0)",
+ confchk_wiredtiger_open_basecfg
+ },
+ { "wiredtiger_open_usercfg",
+ "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1,"
+ "cache_size=100MB,checkpoint=(log_size=0,"
+ "name=\"WiredTigerCheckpoint\",wait=0),checkpoint_sync=,"
+ "direct_io=,error_prefix=,eviction=(threads_max=1,threads_min=1),"
+ "eviction_dirty_target=80,eviction_target=80,eviction_trigger=95,"
+ "extensions=,file_extend=,hazard_max=1000,log=(archive=,enabled=0"
+ ",file_max=100MB,path=\"\"),lsm_manager=(merge=,"
+ "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,"
+ "session_max=100,shared_cache=(chunk=10MB,name=,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
+ "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
+ ",method=fsync),verbose=",
+ confchk_wiredtiger_open_usercfg
+ },
+ { NULL, NULL, NULL }
+};
+
+int
+__wt_conn_config_init(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ const WT_CONFIG_ENTRY *ep, **epp;
+
+ conn = S2C(session);
+
+ /* Build a list of pointers to the configuration information. */
+ WT_RET(__wt_calloc_def(session,
+ sizeof(config_entries) / sizeof(config_entries[0]), &epp));
+ conn->config_entries = epp;
+
+ /* Fill in the list to reference the default information. */
+ for (ep = config_entries;;) {
+ *epp++ = ep++;
+ if (ep->method == NULL)
+ break;
+ }
+ return (0);
+}
+
+void
+__wt_conn_config_discard(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ __wt_free(session, conn->config_entries);
+}
diff --git a/src/third_party/wiredtiger/src/config/config_ext.c b/src/third_party/wiredtiger/src/config/config_ext.c
new file mode 100644
index 00000000000..26b3799d61c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_ext.c
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_config_parser_open --
+ * WT_EXTENSION_API->config_parser_open implementation.
+ */
+int
+__wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
+{
+ WT_UNUSED(wt_ext);
+ return (wiredtiger_config_parser_open(
+ wt_session, config, len, config_parserp));
+}
+
+/*
+ * __wt_ext_config_get --
+ * Given a NULL-terminated list of configuration strings, find the final
+ * value for a given string key (external API version).
+ */
+int
+__wt_ext_config_get(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key,
+ WT_CONFIG_ITEM *cval)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+ const char **cfg;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ if ((cfg = (const char **)cfg_arg) == NULL)
+ return (WT_NOTFOUND);
+ return (__wt_config_gets(session, cfg, key, cval));
+}
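+
+/*
+ * Editor's illustrative sketch, not part of the original source: how an
+ * extension might read its configuration; the "chunk_size" key and the
+ * config argument handed to the extension are hypothetical.
+ *
+ *	WT_CONFIG_ITEM v;
+ *
+ *	if (wt_api->config_get(
+ *	    wt_api, session, config, "chunk_size", &v) == 0)
+ *		chunk_size = (uint64_t)v.val;
+ */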
diff --git a/src/third_party/wiredtiger/src/config/config_upgrade.c b/src/third_party/wiredtiger/src/config/config_upgrade.c
new file mode 100644
index 00000000000..24297df839b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/config/config_upgrade.c
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_config_upgrade --
+ * Upgrade a configuration string by appending the replacement version.
+ */
+int
+__wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+ WT_CONFIG_ITEM v;
+ const char *config;
+
+ config = buf->data;
+
+ /*
+ * wiredtiger_open:
+ * lsm_merge=boolean -> lsm_manager=(merge=boolean)
+ */
+ if (__wt_config_getones(
+ session, config, "lsm_merge", &v) != WT_NOTFOUND)
+ WT_RET(__wt_buf_catfmt(session, buf,
+ ",lsm_manager=(merge=%s)", v.val ? "true" : "false"));
+
+ return (0);
+}
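+
+/*
+ * Editor's note, a worked example, not in the original source: a buffer
+ * holding "create,lsm_merge=false" leaves this function holding
+ * "create,lsm_merge=false,lsm_manager=(merge=false)" -- the appended
+ * replacement wins because later values take precedence, so old strings
+ * keep working while new code reads the lsm_manager category.
+ */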
diff --git a/src/third_party/wiredtiger/src/conn/api_strerror.c b/src/third_party/wiredtiger/src/conn/api_strerror.c
new file mode 100644
index 00000000000..1ad136eae12
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/api_strerror.c
@@ -0,0 +1,43 @@
+/* DO NOT EDIT: automatically built by dist/api_err.py. */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_strerror --
+ * Return a string for any error value.
+ */
+const char *
+wiredtiger_strerror(int error)
+{
+ static char errbuf[64];
+ char *p;
+
+ if (error == 0)
+ return ("Successful return: 0");
+
+ switch (error) {
+ case WT_DUPLICATE_KEY:
+ return ("WT_DUPLICATE_KEY: attempt to insert an existing key");
+ case WT_ERROR:
+ return ("WT_ERROR: non-specific WiredTiger error");
+ case WT_NOTFOUND:
+ return ("WT_NOTFOUND: item not found");
+ case WT_PANIC:
+ return ("WT_PANIC: WiredTiger library panic");
+ case WT_RESTART:
+ return ("WT_RESTART: restart the operation (internal)");
+ case WT_ROLLBACK:
+ return ("WT_ROLLBACK: conflict between concurrent operations");
+ default:
+ if (error > 0 && (p = strerror(error)) != NULL)
+ return (p);
+ break;
+ }
+
+ /*
+ * !!!
+ * Not thread-safe, but this is never supposed to happen.
+ */
+ (void)snprintf(errbuf, sizeof(errbuf), "Unknown error: %d", error);
+ return (errbuf);
+}
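+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * function accepts WiredTiger and POSIX error values alike.
+ *
+ *	int ret;
+ *
+ *	if ((ret = cursor->search(cursor)) != 0)
+ *		(void)fprintf(stderr,
+ *		    "search: %s\n", wiredtiger_strerror(ret));
+ */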
diff --git a/src/third_party/wiredtiger/src/conn/api_version.c b/src/third_party/wiredtiger/src/conn/api_version.c
new file mode 100644
index 00000000000..1355220c585
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/api_version.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_version --
+ * Return library version information.
+ */
+const char *
+wiredtiger_version(int *majorp, int *minorp, int *patchp)
+{
+ if (majorp != NULL)
+ *majorp = WIREDTIGER_VERSION_MAJOR;
+ if (minorp != NULL)
+ *minorp = WIREDTIGER_VERSION_MINOR;
+ if (patchp != NULL)
+ *patchp = WIREDTIGER_VERSION_PATCH;
+ return (WIREDTIGER_VERSION_STRING);
+}
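+
+/*
+ * Editor's illustrative sketch, not part of the original source:
+ *
+ *	int major, minor, patch;
+ *
+ *	(void)printf("%s\n", wiredtiger_version(&major, &minor, &patch));
+ *
+ * Any of the output pointers may be NULL when the caller only wants the
+ * version string.
+ */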
diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c
new file mode 100644
index 00000000000..c7562ab94c3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_api.c
@@ -0,0 +1,1573 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __conn_statistics_config(WT_SESSION_IMPL *, const char *[]);
+
+/*
+ * ext_collate --
+ * Call the collation function (external API version).
+ */
+static int
+ext_collate(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmpp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ WT_RET(__wt_compare(session, collator, first, second, cmpp));
+
+ return (0);
+}
+
+/*
+ * ext_collator_config --
+ * Given a configuration, configure the collator (external API version).
+ */
+static int
+ext_collator_config(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ WT_CONFIG_ARG *cfg_arg, WT_COLLATOR **collatorp, int *ownp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+ const char **cfg;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ /* The default is a standard lexicographic comparison. */
+ if ((cfg = (const char **)cfg_arg) == NULL)
+ return (0);
+
+ return (__wt_collator_config(session, cfg, collatorp, ownp));
+}
+
+/*
+ * __wt_collator_config --
+ * Given a configuration, configure the collator.
+ */
+int
+__wt_collator_config(WT_SESSION_IMPL *session, const char **cfg,
+ WT_COLLATOR **collatorp, int *ownp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+
+ *collatorp = NULL;
+ *ownp = 0;
+
+ conn = S2C(session);
+
+ if ((ret = __wt_config_gets(session, cfg, "collator", &cval)) != 0)
+ return (ret == WT_NOTFOUND ? 0 : ret);
+
+ if (cval.len > 0) {
+ TAILQ_FOREACH(ncoll, &conn->collqh, q)
+ if (WT_STRING_MATCH(ncoll->name, cval.str, cval.len))
+ break;
+
+ if (ncoll == NULL)
+ WT_RET_MSG(session, EINVAL,
+ "unknown collator '%.*s'", (int)cval.len, cval.str);
+
+ if (ncoll->collator->customize != NULL) {
+ WT_RET(__wt_config_gets(session,
+ session->dhandle->cfg, "app_metadata", &cval));
+ WT_RET(ncoll->collator->customize(
+ ncoll->collator, &session->iface,
+ session->dhandle->name, &cval, collatorp));
+ }
+ if (*collatorp == NULL)
+ *collatorp = ncoll->collator;
+ else
+ *ownp = 1;
+ }
+
+ return (0);
+}
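+
+/*
+ * Editor's illustrative sketch, not part of the original source: the
+ * application-side path that ends up here; the "reverse" name and
+ * rev_collator object are hypothetical (error handling omitted):
+ *
+ *	conn->add_collator(conn, "reverse", &rev_collator, NULL);
+ *	session->create(session, "table:mytable",
+ *	    "key_format=S,value_format=S,collator=reverse");
+ */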
+
+/*
+ * __conn_get_extension_api --
+ * WT_CONNECTION.get_extension_api method.
+ */
+static WT_EXTENSION_API *
+__conn_get_extension_api(WT_CONNECTION *wt_conn)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ conn->extension_api.conn = wt_conn;
+ conn->extension_api.err_printf = __wt_ext_err_printf;
+ conn->extension_api.msg_printf = __wt_ext_msg_printf;
+ conn->extension_api.strerror = wiredtiger_strerror;
+ conn->extension_api.scr_alloc = __wt_ext_scr_alloc;
+ conn->extension_api.scr_free = __wt_ext_scr_free;
+ conn->extension_api.collator_config = ext_collator_config;
+ conn->extension_api.collate = ext_collate;
+ conn->extension_api.config_parser_open = __wt_ext_config_parser_open;
+ conn->extension_api.config_get = __wt_ext_config_get;
+ conn->extension_api.metadata_insert = __wt_ext_metadata_insert;
+ conn->extension_api.metadata_remove = __wt_ext_metadata_remove;
+ conn->extension_api.metadata_search = __wt_ext_metadata_search;
+ conn->extension_api.metadata_update = __wt_ext_metadata_update;
+ conn->extension_api.struct_pack = __wt_ext_struct_pack;
+ conn->extension_api.struct_size = __wt_ext_struct_size;
+ conn->extension_api.struct_unpack = __wt_ext_struct_unpack;
+ conn->extension_api.transaction_id = __wt_ext_transaction_id;
+ conn->extension_api.transaction_isolation_level =
+ __wt_ext_transaction_isolation_level;
+ conn->extension_api.transaction_notify = __wt_ext_transaction_notify;
+ conn->extension_api.transaction_oldest = __wt_ext_transaction_oldest;
+ conn->extension_api.transaction_visible = __wt_ext_transaction_visible;
+ conn->extension_api.version = wiredtiger_version;
+
+ return (&conn->extension_api);
+}
+
+#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
+ extern int snappy_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+#endif
+#ifdef HAVE_BUILTIN_EXTENSION_ZLIB
+ extern int zlib_extension_init(WT_CONNECTION *, WT_CONFIG_ARG *);
+#endif
+
+/*
+ * __conn_load_default_extensions --
+ * Load extensions that are enabled via --with-builtins.
+ */
+static int
+__conn_load_default_extensions(WT_CONNECTION_IMPL *conn)
+{
+ WT_UNUSED(conn);
+#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
+ WT_RET(snappy_extension_init(&conn->iface, NULL));
+#endif
+#ifdef HAVE_BUILTIN_EXTENSION_ZLIB
+ WT_RET(zlib_extension_init(&conn->iface, NULL));
+#endif
+ return (0);
+}
+
+/*
+ * __conn_load_extension --
+ * WT_CONNECTION->load_extension method.
+ */
+static int
+__conn_load_extension(
+ WT_CONNECTION *wt_conn, const char *path, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_DLH *dlh;
+ WT_SESSION_IMPL *session;
+ int (*load)(WT_CONNECTION *, WT_CONFIG_ARG *);
+ int is_local;
+ const char *init_name, *terminate_name;
+
+ dlh = NULL;
+ init_name = terminate_name = NULL;
+ is_local = (strcmp(path, "local") == 0);
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, load_extension, config, cfg);
+
+ /*
+ * This assumes the underlying shared libraries are reference counted,
+ * that is, that re-opening a shared library simply increments a ref
+ * count, and closing it simply decrements the ref count, and the last
+ * close discards the reference entirely -- in other words, we do not
+ * check to see if we've already opened this shared library.
+ */
+ WT_ERR(__wt_dlopen(session, is_local ? NULL : path, &dlh));
+
+ /*
+ * Find the load function, remember the unload function for when we
+ * close.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "entry", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &init_name));
+ WT_ERR(__wt_dlsym(session, dlh, init_name, 1, &load));
+
+ WT_ERR(__wt_config_gets(session, cfg, "terminate", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &terminate_name));
+ WT_ERR(__wt_dlsym(session, dlh, terminate_name, 0, &dlh->terminate));
+
+ /* Call the load function last, it simplifies error handling. */
+ WT_ERR(load(wt_conn, (WT_CONFIG_ARG *)cfg));
+
+ /* Link onto the environment's list of open libraries. */
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->dlhqh, dlh, q);
+ __wt_spin_unlock(session, &conn->api_lock);
+ dlh = NULL;
+
+err: if (dlh != NULL)
+ WT_TRET(__wt_dlclose(session, dlh));
+ __wt_free(session, init_name);
+ __wt_free(session, terminate_name);
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
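+
+/*
+ * Editor's illustrative sketch, not part of the original source; the
+ * library path and entry points are hypothetical (error handling
+ * omitted):
+ *
+ *	conn->load_extension(conn,
+ *	    "/usr/local/lib/libmy_compress.so",
+ *	    "entry=my_compress_init,terminate=my_compress_terminate");
+ */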
+
+/*
+ * __conn_load_extensions --
+ * Load the list of application-configured extensions.
+ */
+static int
+__conn_load_extensions(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG subconfig;
+ WT_CONFIG_ITEM cval, skey, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(exconfig);
+ WT_DECL_ITEM(expath);
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ERR(__conn_load_default_extensions(conn));
+
+ WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
+ WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
+ while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
+ if (expath == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &expath));
+ WT_ERR(__wt_buf_fmt(
+ session, expath, "%.*s", (int)skey.len, skey.str));
+ if (sval.len > 0) {
+ if (exconfig == NULL)
+ WT_ERR(__wt_scr_alloc(session, 0, &exconfig));
+ WT_ERR(__wt_buf_fmt(session,
+ exconfig, "%.*s", (int)sval.len, sval.str));
+ }
+ WT_ERR(conn->iface.load_extension(&conn->iface,
+ expath->data, (sval.len > 0) ? exconfig->data : NULL));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(&expath);
+ __wt_scr_free(&exconfig);
+
+ return (ret);
+}
+
+/*
+ * __conn_add_collator --
+ * WT_CONNECTION->add_collator method.
+ */
+static int
+__conn_add_collator(WT_CONNECTION *wt_conn,
+ const char *name, WT_COLLATOR *collator, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+ WT_SESSION_IMPL *session;
+
+ ncoll = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_collator, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ncoll));
+ WT_ERR(__wt_strdup(session, name, &ncoll->name));
+ ncoll->collator = collator;
+
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->collqh, ncoll, q);
+ ncoll = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ncoll != NULL) {
+ __wt_free(session, ncoll->name);
+ __wt_free(session, ncoll);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_collator --
+ * Remove collator added by WT_CONNECTION->add_collator; only used
+ * internally.
+ */
+int
+__wt_conn_remove_collator(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COLLATOR *ncoll;
+
+ conn = S2C(session);
+
+ while ((ncoll = TAILQ_FIRST(&conn->collqh)) != NULL) {
+ /* Call any termination method. */
+ if (ncoll->collator->terminate != NULL)
+ WT_TRET(ncoll->collator->terminate(
+ ncoll->collator, (WT_SESSION *)session));
+
+ /* Remove from the connection's list, free memory. */
+ TAILQ_REMOVE(&conn->collqh, ncoll, q);
+ __wt_free(session, ncoll->name);
+ __wt_free(session, ncoll);
+ }
+
+ return (ret);
+}
+
+/*
+ * __conn_add_compressor --
+ * WT_CONNECTION->add_compressor method.
+ */
+static int
+__conn_add_compressor(WT_CONNECTION *wt_conn,
+ const char *name, WT_COMPRESSOR *compressor, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COMPRESSOR *ncomp;
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(name);
+ WT_UNUSED(compressor);
+ ncomp = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_compressor, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ncomp));
+ WT_ERR(__wt_strdup(session, name, &ncomp->name));
+ ncomp->compressor = compressor;
+
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->compqh, ncomp, q);
+ ncomp = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ncomp != NULL) {
+ __wt_free(session, ncomp->name);
+ __wt_free(session, ncomp);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_compressor --
+ * Remove compressor added by WT_CONNECTION->add_compressor; only used
+ * internally.
+ */
+int
+__wt_conn_remove_compressor(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_COMPRESSOR *ncomp;
+
+ conn = S2C(session);
+
+ while ((ncomp = TAILQ_FIRST(&conn->compqh)) != NULL) {
+ /* Call any termination method. */
+ if (ncomp->compressor->terminate != NULL)
+ WT_TRET(ncomp->compressor->terminate(
+ ncomp->compressor, (WT_SESSION *)session));
+
+ /* Remove from the connection's list, free memory. */
+ TAILQ_REMOVE(&conn->compqh, ncomp, q);
+ __wt_free(session, ncomp->name);
+ __wt_free(session, ncomp);
+ }
+
+ return (ret);
+}
+
+/*
+ * __conn_add_data_source --
+ * WT_CONNECTION->add_data_source method.
+ */
+static int
+__conn_add_data_source(WT_CONNECTION *wt_conn,
+ const char *prefix, WT_DATA_SOURCE *dsrc, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_DATA_SOURCE *ndsrc;
+ WT_SESSION_IMPL *session;
+
+ ndsrc = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_data_source, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_calloc_def(session, 1, &ndsrc));
+ WT_ERR(__wt_strdup(session, prefix, &ndsrc->prefix));
+ ndsrc->dsrc = dsrc;
+
+ /* Link onto the environment's list of data sources. */
+ __wt_spin_lock(session, &conn->api_lock);
+ TAILQ_INSERT_TAIL(&conn->dsrcqh, ndsrc, q);
+ ndsrc = NULL;
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: if (ndsrc != NULL) {
+ __wt_free(session, ndsrc->prefix);
+ __wt_free(session, ndsrc);
+ }
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_conn_remove_data_source --
+ * Remove data source added by WT_CONNECTION->add_data_source.
+ */
+int
+__wt_conn_remove_data_source(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_NAMED_DATA_SOURCE *ndsrc;
+
+ conn = S2C(session);
+
+ while ((ndsrc = TAILQ_FIRST(&conn->dsrcqh)) != NULL) {
+ /* Call any termination method. */
+ if (ndsrc->dsrc->terminate != NULL)
+ WT_TRET(ndsrc->dsrc->terminate(
+ ndsrc->dsrc, (WT_SESSION *)session));
+
+ /* Remove from the connection's list, free memory. */
+ TAILQ_REMOVE(&conn->dsrcqh, ndsrc, q);
+ __wt_free(session, ndsrc->prefix);
+ __wt_free(session, ndsrc);
+ }
+
+ return (ret);
+}
+
+/*
+ * __conn_add_extractor --
+ * WT_CONNECTION->add_extractor method.
+ */
+static int
+__conn_add_extractor(WT_CONNECTION *wt_conn,
+ const char *name, WT_EXTRACTOR *extractor, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(name);
+ WT_UNUSED(extractor);
+ ret = ENOTSUP;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, add_extractor, config, cfg);
+ WT_UNUSED(cfg);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_flush --
+ * WT_CONNECTION.async_flush method.
+ */
+static int
+__conn_async_flush(WT_CONNECTION *wt_conn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL_NOCONF(conn, session, async_flush);
+ WT_ERR(__wt_async_flush(session));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_async_new_op --
+ * WT_CONNECTION.async_new_op method.
+ */
+static int
+__conn_async_new_op(WT_CONNECTION *wt_conn, const char *uri, const char *config,
+ WT_ASYNC_CALLBACK *callback, WT_ASYNC_OP **asyncopp)
+{
+ WT_ASYNC_OP_IMPL *op;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, async_new_op, config, cfg);
+ WT_ERR(__wt_async_new_op(session, uri, config, cfg, callback, &op));
+
+ *asyncopp = &op->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_get_home --
+ * WT_CONNECTION.get_home method.
+ */
+static const char *
+__conn_get_home(WT_CONNECTION *wt_conn)
+{
+ return (((WT_CONNECTION_IMPL *)wt_conn)->home);
+}
+
+/*
+ * __conn_configure_method --
+ * WT_CONNECTION.configure_method method.
+ */
+static int
+__conn_configure_method(WT_CONNECTION *wt_conn, const char *method,
+ const char *uri, const char *config, const char *type, const char *check)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL_NOCONF(conn, session, configure_method);
+
+ ret = __wt_configure_method(session, method, uri, config, type, check);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_is_new --
+ * WT_CONNECTION->is_new method.
+ */
+static int
+__conn_is_new(WT_CONNECTION *wt_conn)
+{
+ return (((WT_CONNECTION_IMPL *)wt_conn)->is_new);
+}
+
+/*
+ * __conn_close --
+ * WT_CONNECTION->close method.
+ */
+static int
+__conn_close(WT_CONNECTION *wt_conn, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *s, *session;
+ uint32_t i;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ CONNECTION_API_CALL(conn, session, close, config, cfg);
+
+ WT_TRET(__wt_config_gets(session, cfg, "leak_memory", &cval));
+ if (cval.val != 0)
+ F_SET(conn, WT_CONN_LEAK_MEMORY);
+
+err: /*
+ * Rollback all running transactions.
+ * We do this as a separate pass because an active transaction in one
+ * session could cause trouble when closing a file, even if that
+ * session never referenced that file.
+ */
+ for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
+ if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL) &&
+ F_ISSET(&s->txn, TXN_RUNNING)) {
+ wt_session = &s->iface;
+ WT_TRET(wt_session->rollback_transaction(
+ wt_session, NULL));
+ }
+
+ /* Close open, external sessions. */
+ for (s = conn->sessions, i = 0; i < conn->session_cnt; ++s, ++i)
+ if (s->active && !F_ISSET(s, WT_SESSION_INTERNAL)) {
+ wt_session = &s->iface;
+ /*
+ * Notify the user that we are closing the session
+ * handle via the registered close callback.
+ */
+ if (s->event_handler->handle_close != NULL)
+ WT_TRET(s->event_handler->handle_close(
+ s->event_handler, wt_session, NULL));
+ WT_TRET(wt_session->close(wt_session, config));
+ }
+
+ WT_TRET(__wt_connection_close(conn));
+
+ /* We no longer have a session, don't try to update it. */
+ session = NULL;
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_reconfigure --
+ * WT_CONNECTION->reconfigure method.
+ */
+static int
+__conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *p, *config_cfg[] = { NULL, NULL, NULL };
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+
+ CONNECTION_API_CALL(conn, session, reconfigure, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Serialize reconfiguration. */
+ __wt_spin_lock(session, &conn->reconfig_lock);
+
+ /*
+	 * The configuration argument has been checked for validity; replace
+	 * the previous connection configuration.
+ *
+ * DO NOT merge the configuration before the reconfigure calls. Some
+ * of the underlying reconfiguration functions do explicit checks with
+ * the second element of the configuration array, knowing the defaults
+ * are in slot #1 and the application's modifications are in slot #2.
+ */
+ config_cfg[0] = conn->cfg;
+ config_cfg[1] = config;
+
+ WT_ERR(__conn_statistics_config(session, config_cfg));
+ WT_ERR(__wt_async_reconfig(session, config_cfg));
+ WT_ERR(__wt_cache_config(session, config_cfg));
+ WT_ERR(__wt_cache_pool_config(session, config_cfg));
+ WT_ERR(__wt_checkpoint_server_create(session, config_cfg));
+ WT_ERR(__wt_lsm_manager_reconfig(session, config_cfg));
+ WT_ERR(__wt_statlog_create(session, config_cfg));
+ WT_ERR(__wt_verbose_config(session, config_cfg));
+
+ WT_ERR(__wt_config_merge(session, config_cfg, &p));
+ __wt_free(session, conn->cfg);
+ conn->cfg = p;
+
+err: __wt_spin_unlock(session, &conn->reconfig_lock);
+
+ API_END_RET(session, ret);
+}
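Only the subsystems reconfigured above (statistics, async, cache and cache pool, checkpoint server, LSM manager, statistics logging, verbose messages) can be changed at runtime; anything else requires reopening the connection. An illustrative call, with example keys and values:

	ret = conn->reconfigure(conn,
	    "cache_size=500MB,statistics=(fast),verbose=[checkpoint]");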
+
+/*
+ * __conn_open_session --
+ * WT_CONNECTION->open_session method.
+ */
+static int
+__conn_open_session(WT_CONNECTION *wt_conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
+ WT_SESSION **wt_sessionp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session, *session_ret;
+
+ *wt_sessionp = NULL;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ session_ret = NULL;
+
+ CONNECTION_API_CALL(conn, session, open_session, config, cfg);
+ WT_UNUSED(cfg);
+
+ WT_ERR(__wt_open_session(conn, event_handler, config, &session_ret));
+
+ *wt_sessionp = &session_ret->iface;
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __conn_config_append --
+ * Append an entry to a config stack.
+ */
+static void
+__conn_config_append(const char *cfg[], const char *config)
+{
+ while (*cfg != NULL)
+ ++cfg;
+ *cfg = config;
+}
+
+/*
+ * __conn_config_check_version --
+ *	Check that a configuration version is compatible with this release.
+ */
+static int
+__conn_config_check_version(WT_SESSION_IMPL *session, const char *config)
+{
+ WT_CONFIG_ITEM vmajor, vminor;
+
+ /*
+ * Version numbers aren't included in all configuration strings, but
+ * we check all of them just in case. Ignore configurations without
+ * a version.
+ */
+ if (__wt_config_getones(
+ session, config, "version.major", &vmajor) == WT_NOTFOUND)
+ return (0);
+ WT_RET(__wt_config_getones(session, config, "version.minor", &vminor));
+
+ if (vmajor.val > WIREDTIGER_VERSION_MAJOR ||
+ (vmajor.val == WIREDTIGER_VERSION_MAJOR &&
+ vminor.val > WIREDTIGER_VERSION_MINOR))
+ WT_RET_MSG(session, ENOTSUP,
+ "WiredTiger configuration is from an incompatible release "
+ "of the WiredTiger engine");
+
+ return (0);
+}
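For example, a base configuration file written by a 2.3 release begins with a line along the lines of:

	version=(major=2,minor=3)

and opening that database with an older 2.2 library fails with ENOTSUP before the rest of the configuration is parsed.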
+
+/*
+ * __conn_config_file --
+ * Read WiredTiger config files from the home directory.
+ */
+static int
+__conn_config_file(WT_SESSION_IMPL *session,
+ const char *filename, int is_user, const char **cfg, WT_ITEM *cbuf)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t len;
+ wt_off_t size;
+ int exist, quoted;
+ char *p, *t;
+
+ conn = S2C(session);
+ fh = NULL;
+
+ /* Configuration files are always optional. */
+ WT_RET(__wt_exist(session, filename, &exist));
+ if (!exist)
+ return (0);
+
+ /*
+ * The base configuration should not exist if we are creating this
+ * database.
+ */
+ if (!is_user && conn->is_new)
+ WT_RET_MSG(session, EINVAL,
+ "%s exists before database creation", filename);
+
+ /* Open the configuration file. */
+ WT_RET(__wt_open(session, filename, 0, 0, 0, &fh));
+ WT_ERR(__wt_filesize(session, fh, &size));
+ if (size == 0)
+ goto err;
+
+ /*
+ * Sanity test: a 100KB configuration file would be insane. (There's
+ * no practical reason to limit the file size, but I can either limit
+ * the file size to something rational, or add code to test if the
+ * wt_off_t size is larger than a uint32_t, which is more complicated
+ * and a waste of time.)
+ */
+ if (size > 100 * 1024)
+ WT_ERR_MSG(
+ session, EFBIG, "Configuration file too big: %s", filename);
+ len = (size_t)size;
+
+ /*
+	 * Copy the configuration file into memory, with a little slop; I'm not
+	 * interested in debugging off-by-ones.
+	 *
+	 * The beginning of a file is treated the same as an unquoted newline
+	 * character; simplify the parsing loop by pretending that's what we're
+	 * doing.
+ */
+ WT_ERR(__wt_buf_init(session, cbuf, len + 10));
+ WT_ERR(__wt_read(
+ session, fh, (wt_off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
+ ((uint8_t *)cbuf->mem)[0] = '\n';
+ cbuf->size = len + 1;
+
+ /*
+ * Collapse the file's lines into a single string: newline characters
+ * are replaced with commas unless the newline is quoted or backslash
+ * escaped. Comment lines (an unescaped newline where the next non-
+	 * white-space character is a hash) are discarded.
+ */
+ for (quoted = 0, p = t = cbuf->mem; len > 0;) {
+ /*
+ * Backslash pairs pass through untouched, unless immediately
+ * preceding a newline, in which case both the backslash and
+ * the newline are discarded. Backslash characters escape
+ * quoted characters, too, that is, a backslash followed by a
+ * quote doesn't start or end a quoted string.
+ */
+ if (*p == '\\' && len > 1) {
+ if (p[1] != '\n') {
+ *t++ = p[0];
+ *t++ = p[1];
+ }
+ p += 2;
+ len -= 2;
+ continue;
+ }
+
+ /*
+ * If we're in a quoted string, or starting a quoted string,
+ * take all characters, including white-space and newlines.
+ */
+ if (quoted || *p == '"') {
+ if (*p == '"')
+ quoted = !quoted;
+ *t++ = *p++;
+ --len;
+ continue;
+ }
+
+ /* Everything else gets taken, except for newline characters. */
+ if (*p != '\n') {
+ *t++ = *p++;
+ --len;
+ continue;
+ }
+
+ /*
+ * Replace any newline characters with commas (and strings of
+ * commas are safe).
+ *
+ * After any newline, skip to a non-white-space character; if
+ * the next character is a hash mark, skip to the next newline.
+ */
+ for (;;) {
+ for (*t++ = ','; --len > 0 && isspace(*++p);)
+ ;
+ if (len == 0)
+ break;
+ if (*p != '#')
+ break;
+ while (--len > 0 && *++p != '\n')
+ ;
+ if (len == 0)
+ break;
+ }
+ }
+ *t = '\0';
+ cbuf->size = WT_PTRDIFF(t, cbuf->data);
+
+ /* Check any version. */
+ WT_ERR(__conn_config_check_version(session, cbuf->data));
+
+ /* Upgrade the configuration string. */
+ WT_ERR(__wt_config_upgrade(session, cbuf));
+
+ /* Check the configuration information. */
+ WT_ERR(__wt_config_check(session, is_user ?
+ WT_CONFIG_REF(session, wiredtiger_open_usercfg) :
+ WT_CONFIG_REF(session, wiredtiger_open_basecfg), cbuf->data, 0));
+
+ /* Append it to the stack. */
+ __conn_config_append(cfg, cbuf->data);
+
+err: if (fh != NULL)
+ WT_TRET(__wt_close(session, fh));
+ return (ret);
+}
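As a worked example of the collapsing loop, a user configuration file such as:

	# tuning overrides
	cache_size=1GB
	eviction=(threads_min=2,
	    threads_max=4)

is reduced to a single string along the lines of:

	,,cache_size=1GB,eviction=(threads_min=2,,threads_max=4),

with the comment discarded and each unquoted newline replaced by a comma; the runs of commas are harmless to the configuration parser.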
+
+/*
+ * __conn_config_env --
+ * Read configuration from an environment variable, if set.
+ */
+static int
+__conn_config_env(WT_SESSION_IMPL *session, const char *cfg[], WT_ITEM *cbuf)
+{
+ WT_CONFIG_ITEM cval;
+ const char *env_config;
+ size_t len;
+
+ if ((env_config = getenv("WIREDTIGER_CONFIG")) == NULL)
+ return (0);
+ len = strlen(env_config);
+ if (len == 0)
+ return (0);
+ WT_RET(__wt_buf_set(session, cbuf, env_config, len + 1));
+
+ /*
+ * Security stuff:
+ *
+ * If the "use_environment_priv" configuration string is set, use the
+ * environment variable if the process has appropriate privileges.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "use_environment_priv", &cval));
+ if (cval.val == 0 && __wt_has_priv())
+ WT_RET_MSG(session, WT_ERROR, "%s",
+ "WIREDTIGER_CONFIG environment variable set but process "
+ "lacks privileges to use that environment variable");
+
+ /* Check any version. */
+ WT_RET(__conn_config_check_version(session, env_config));
+
+ /* Upgrade the configuration string. */
+ WT_RET(__wt_config_upgrade(session, cbuf));
+
+ /* Check the configuration information. */
+ WT_RET(__wt_config_check(session,
+ WT_CONFIG_REF(session, wiredtiger_open), env_config, 0));
+
+ /* Append it to the stack. */
+ __conn_config_append(cfg, env_config);
+
+ return (0);
+}
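A sketch of exercising this path from a test program (setenv is POSIX, from <stdlib.h>; the configuration value is illustrative):

	setenv("WIREDTIGER_CONFIG", "cache_size=1GB,verbose=[recovery]", 1);
	ret = wiredtiger_open(home, NULL, "create", &conn);

Note the privilege check above: a process running with special privileges must also pass "use_environment_priv=true" in its direct configuration, or the open fails.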
+
+/*
+ * __conn_home --
+ * Set the database home directory.
+ */
+static int
+__conn_home(WT_SESSION_IMPL *session, const char *home, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+
+ /* If the application specifies a home directory, use it. */
+ if (home != NULL)
+ goto copy;
+
+ /* If there's no WIREDTIGER_HOME environment variable, use ".". */
+ if ((home = getenv("WIREDTIGER_HOME")) == NULL || strlen(home) == 0) {
+ home = ".";
+ goto copy;
+ }
+
+ /*
+ * Security stuff:
+ *
+ * Unless the "use_environment_priv" configuration string is set,
+ * fail if the process is running with special privileges.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "use_environment_priv", &cval));
+ if (cval.val == 0 && __wt_has_priv())
+ WT_RET_MSG(session, WT_ERROR, "%s",
+ "WIREDTIGER_HOME environment variable set but process "
+ "lacks privileges to use that environment variable");
+
+copy: return (__wt_strdup(session, home, &S2C(session)->home));
+}
+
+/*
+ * __conn_single --
+ * Confirm that no other thread of control is using this database.
+ */
+static int
+__conn_single(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn, *t;
+ WT_DECL_RET;
+ WT_FH *fh;
+ size_t len;
+ wt_off_t size;
+ char buf[256];
+
+ conn = S2C(session);
+ fh = NULL;
+
+ __wt_spin_lock(session, &__wt_process.spinlock);
+
+ /*
+ * We first check for other threads of control holding a lock on this
+ * database, because the byte-level locking functions are based on the
+	 * POSIX 1003.1 fcntl APIs, which require that all locks associated
+	 * with a file for a given process be removed when any file descriptor
+	 * for the file is closed by that process. In other words, we can't
+	 * open a file handle on the lock file until we are certain that
+	 * closing that handle won't discard the owning thread's lock.
+	 * Applications hopefully won't open a database in multiple threads,
+	 * but we don't want an open to fail the first time and then succeed
+	 * the second.
+ */
+ TAILQ_FOREACH(t, &__wt_process.connqh, q)
+ if (t->home != NULL &&
+ t != conn && strcmp(t->home, conn->home) == 0) {
+ ret = EBUSY;
+ break;
+ }
+ if (ret != 0)
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "thread in this process");
+
+ /*
+ * !!!
+ * Be careful changing this code.
+ *
+ * We locked the WiredTiger file before release 2.3.2; a separate lock
+ * file was added after 2.3.1 because hot backup has to copy the
+ * WiredTiger file and system utilities on Windows can't copy locked
+ * files.
+ *
+	 * For this reason, we don't use the lock file's existence to decide if
+	 * we're creating the database or not; we use the WiredTiger file
+	 * instead, since it has existed in every version of WiredTiger.
+ *
+ * Additionally, avoid an upgrade race: a 2.3.1 release process might
+ * have the WiredTiger file locked, and we're going to create the lock
+ * file and lock it instead. For this reason, first acquire a lock on
+ * the lock file and then a lock on the WiredTiger file, then release
+ * the latter so hot backups can proceed. (If someone were to run a
+ * current release and subsequently a historic release, we could still
+ * fail because the historic release will ignore our lock file and will
+ * then successfully lock the WiredTiger file, but I can't think of any
+ * way to fix that.)
+ *
+ * Open the WiredTiger lock file, creating it if it doesn't exist. (I'm
+ * not removing the lock file if we create it and subsequently fail, it
+ * isn't simple to detect that case, and there's no risk other than a
+ * useless file being left in the directory.)
+ */
+ WT_ERR(__wt_open(session, WT_SINGLETHREAD, 1, 0, 0, &conn->lock_fh));
+
+ /*
+ * Lock a byte of the file: if we don't get the lock, some other process
+ * is holding it, we're done. The file may be zero-length, and that's
+ * OK, the underlying call supports locking past the end-of-file.
+ */
+ if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, 1) != 0)
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "process");
+
+ /*
+ * If the size of the lock file is 0, we created it (or we won a locking
+ * race with the thread that created it, it doesn't matter).
+ *
+ * Write something into the file, zero-length files make me nervous.
+ */
+ WT_ERR(__wt_filesize(session, conn->lock_fh, &size));
+ if (size == 0) {
+#define WT_SINGLETHREAD_STRING "WiredTiger lock file\n"
+ WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0,
+ strlen(WT_SINGLETHREAD_STRING), WT_SINGLETHREAD_STRING));
+ }
+
+ /* We own the lock file, optionally create the WiredTiger file. */
+ WT_ERR(__wt_config_gets(session, cfg, "create", &cval));
+ WT_ERR(__wt_open(session,
+ WT_WIREDTIGER, cval.val == 0 ? 0 : 1, 0, 0, &fh));
+
+ /*
+ * Lock the WiredTiger file (for backward compatibility reasons as
+ * described above). Immediately release the lock, it's just a test.
+ */
+ if (__wt_bytelock(fh, (wt_off_t)0, 1) != 0) {
+ WT_ERR_MSG(session, EBUSY,
+ "WiredTiger database is already being managed by another "
+ "process");
+ }
+ WT_ERR(__wt_bytelock(fh, (wt_off_t)0, 0));
+
+ /*
+ * If the size of the file is zero, we created it, fill it in. If the
+ * size of the file is non-zero, fail if configured for exclusivity.
+ */
+ WT_ERR(__wt_filesize(session, fh, &size));
+ if (size == 0) {
+ len = (size_t)snprintf(buf, sizeof(buf),
+ "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING);
+ WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf));
+
+ conn->is_new = 1;
+ } else {
+ WT_ERR(__wt_config_gets(session, cfg, "exclusive", &cval));
+ if (cval.val != 0)
+ WT_ERR_MSG(session, EEXIST,
+ "WiredTiger database already exists and exclusive "
+ "option configured");
+
+ conn->is_new = 0;
+ }
+
+err: /*
+ * We ignore the connection's lock file handle on error, it will be
+ * closed when the connection structure is destroyed.
+ */
+ if (fh != NULL)
+ WT_TRET(__wt_close(session, fh));
+
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+ return (ret);
+}
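The byte-lock calls above presumably reduce, on POSIX systems, to fcntl record locks; a minimal sketch of the mechanism (an illustration only, not WiredTiger's actual __wt_bytelock):

	#include <errno.h>
	#include <fcntl.h>
	#include <string.h>

	/* Lock (lock != 0) or unlock one byte at the given offset. */
	static int
	bytelock(int fd, off_t byte, int lock)
	{
		struct flock fl;

		memset(&fl, 0, sizeof(fl));
		fl.l_type = lock ? F_WRLCK : F_UNLCK;
		fl.l_whence = SEEK_SET;
		fl.l_start = byte;
		fl.l_len = 1;	/* Locking past end-of-file is legal. */
		return (fcntl(fd, F_SETLK, &fl) == -1 ? errno : 0);
	}

These semantics motivate the ordering in the function: closing any descriptor on a file drops every lock the process holds on it, which is why the in-process check precedes opening any handle on the lock file.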
+
+/*
+ * __conn_statistics_config --
+ * Set statistics configuration.
+ */
+static int
+__conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ uint32_t flags;
+ int set;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "statistics", &cval));
+
+ flags = 0;
+ set = 0;
+ if ((ret = __wt_config_subgets(
+ session, &cval, "none", &sval)) == 0 && sval.val != 0) {
+ LF_SET(WT_CONN_STAT_NONE);
+ ++set;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if ((ret = __wt_config_subgets(
+ session, &cval, "fast", &sval)) == 0 && sval.val != 0) {
+ LF_SET(WT_CONN_STAT_FAST);
+ ++set;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if ((ret = __wt_config_subgets(
+ session, &cval, "all", &sval)) == 0 && sval.val != 0) {
+ LF_SET(WT_CONN_STAT_ALL | WT_CONN_STAT_FAST);
+ ++set;
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if ((ret = __wt_config_subgets(
+ session, &cval, "clear", &sval)) == 0 && sval.val != 0)
+ LF_SET(WT_CONN_STAT_CLEAR);
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (set > 1)
+ WT_RET_MSG(session, EINVAL,
+ "only one statistics configuration value may be specified");
+
+ /* Configuring statistics clears any existing values. */
+ conn->stat_flags = flags;
+
+ return (0);
+}
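The net effect is that "none", "fast" and "all" are mutually exclusive, while "clear" composes with any of them. For example, this illustrative call configures fast statistics and resets the counters each time they are read:

	ret = conn->reconfigure(conn, "statistics=(fast,clear)");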
+
+/* Simple structure for name and flag configuration searches. */
+typedef struct {
+ const char *name;
+ uint32_t flag;
+} WT_NAME_FLAG;
+
+/*
+ * __wt_verbose_config --
+ * Set verbose configuration.
+ */
+int
+__wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ static const WT_NAME_FLAG verbtypes[] = {
+ { "api", WT_VERB_API },
+ { "block", WT_VERB_BLOCK },
+ { "checkpoint", WT_VERB_CHECKPOINT },
+ { "compact", WT_VERB_COMPACT },
+ { "evict", WT_VERB_EVICT },
+ { "evictserver", WT_VERB_EVICTSERVER },
+ { "fileops", WT_VERB_FILEOPS },
+ { "log", WT_VERB_LOG },
+ { "lsm", WT_VERB_LSM },
+ { "metadata", WT_VERB_METADATA },
+ { "mutex", WT_VERB_MUTEX },
+ { "overflow", WT_VERB_OVERFLOW },
+ { "read", WT_VERB_READ },
+ { "reconcile", WT_VERB_RECONCILE },
+ { "recovery", WT_VERB_RECOVERY },
+ { "salvage", WT_VERB_SALVAGE },
+ { "shared_cache", WT_VERB_SHARED_CACHE },
+ { "split", WT_VERB_SPLIT },
+ { "temporary", WT_VERB_TEMPORARY },
+ { "transaction", WT_VERB_TRANSACTION },
+ { "verify", WT_VERB_VERIFY },
+ { "version", WT_VERB_VERSION },
+ { "write", WT_VERB_WRITE },
+ { NULL, 0 }
+ };
+ WT_CONFIG_ITEM cval, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ const WT_NAME_FLAG *ft;
+ uint32_t flags;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "verbose", &cval));
+
+ flags = 0;
+ for (ft = verbtypes; ft->name != NULL; ft++) {
+ if ((ret = __wt_config_subgets(
+ session, &cval, ft->name, &sval)) == 0 && sval.val != 0) {
+#ifdef HAVE_VERBOSE
+ LF_SET(ft->flag);
+#else
+ WT_RET_MSG(session, EINVAL,
+ "Verbose option specified when WiredTiger built "
+ "without verbose support. Add --enable-verbose to "
+ "configure command and rebuild to include support "
+ "for verbose messages");
+#endif
+ }
+ WT_RET_NOTFOUND_OK(ret);
+ }
+
+ conn->verbose = flags;
+ return (0);
+}
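For example, a build configured with --enable-verbose can be opened with categories drawn from the table above:

	ret = wiredtiger_open(home, NULL,
	    "create,verbose=[checkpoint,evictserver]", &conn);

On a build without verbose support, the same string fails with EINVAL, as coded above.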
+
+/*
+ * __conn_write_config --
+ * Save the configuration used to create a database.
+ */
+static int
+__conn_write_config(
+ WT_SESSION_IMPL *session, const char *filename, const char *cfg[])
+{
+ FILE *fp;
+ WT_CONFIG parser;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ char *path;
+
+ /*
+	 * We were passed an array of configuration strings where slot 0 is
+	 * all possible values and the second and subsequent slots are changes
+ * specified by the application during open (using the wiredtiger_open
+ * configuration string, an environment variable, or user-configuration
+ * file). The base configuration file contains all changes to default
+ * settings made at create, and we include the user-configuration file
+ * in that list, even though we don't expect it to change. Of course,
+ * an application could leave that file as it is right now and not
+ * remove a configuration we need, but applications can also guarantee
+ * all database users specify consistent environment variables and
+ * wiredtiger_open configuration arguments, and if we protect against
+ * those problems, might as well include the application's configuration
+ * file as well.
+ *
+ * If there is no configuration, don't bother creating an empty file.
+ */
+ if (cfg[1] == NULL)
+ return (0);
+
+ WT_RET(__wt_filename(session, filename, &path));
+ if ((fp = fopen(path, "w")) == NULL)
+ ret = __wt_errno();
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (ret);
+
+ fprintf(fp, "%s\n\n",
+ "# Do not modify this file.\n"
+ "#\n"
+ "# WiredTiger created this file when the database was created,\n"
+ "# to store persistent database settings. Instead of changing\n"
+ "# these settings, set a WIREDTIGER_CONFIG environment variable\n"
+ "# or create a WiredTiger.config file to override them.");
+
+ fprintf(fp, "version=(major=%d,minor=%d)\n\n",
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
+
+ /*
+ * We want the list of defaults that have been changed, that is, if the
+ * application didn't somehow configure a setting, we don't write out a
+ * default value, so future releases may silently migrate to new default
+ * values.
+ */
+ while (*++cfg != NULL) {
+		WT_ERR(__wt_config_init(session,
+ &parser, WT_CONFIG_BASE(session, wiredtiger_open_basecfg)));
+ while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
+ if ((ret =
+ __wt_config_getone(session, *cfg, &k, &v)) == 0) {
+ /* Fix quoting for non-trivial settings. */
+ if (v.type == WT_CONFIG_ITEM_STRING) {
+ --v.str;
+ v.len += 2;
+ }
+ fprintf(fp, "%.*s=%.*s\n",
+ (int)k.len, k.str, (int)v.len, v.str);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+err: WT_TRET(fclose(fp));
+
+ /* Don't leave a damaged file in place. */
+ if (ret != 0)
+ (void)__wt_remove(session, filename);
+
+ return (ret);
+}
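The resulting WiredTiger.basecfg is therefore the comment header printed above, the version line, and only those settings that differ from the defaults. An illustrative file body:

	version=(major=2,minor=3)

	cache_size=1GB
	eviction=(threads_max=4)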
+
+/*
+ * wiredtiger_open --
+ * Main library entry point: open a new connection to a WiredTiger
+ * database.
+ */
+int
+wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
+ const char *config, WT_CONNECTION **wt_connp)
+{
+ static const WT_CONNECTION stdc = {
+ __conn_async_flush,
+ __conn_async_new_op,
+ __conn_close,
+ __conn_reconfigure,
+ __conn_get_home,
+ __conn_configure_method,
+ __conn_is_new,
+ __conn_open_session,
+ __conn_load_extension,
+ __conn_add_data_source,
+ __conn_add_collator,
+ __conn_add_compressor,
+ __conn_add_extractor,
+ __conn_get_extension_api
+ };
+ static const WT_NAME_FLAG file_types[] = {
+ { "checkpoint", WT_FILE_TYPE_CHECKPOINT },
+ { "data", WT_FILE_TYPE_DATA },
+ { "log", WT_FILE_TYPE_LOG },
+ { NULL, 0 }
+ };
+
+ WT_CONFIG_ITEM cval, sval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_ITEM i1, i2, i3;
+ const WT_NAME_FLAG *ft;
+ WT_SESSION_IMPL *session;
+
+ /* Leave space for optional additional configuration. */
+ const char *cfg[] = { NULL, NULL, NULL, NULL, NULL, NULL };
+
+ *wt_connp = NULL;
+
+ conn = NULL;
+ session = NULL;
+
+ /*
+ * We could use scratch buffers, but I'd rather the default session
+ * not tie down chunks of memory past the open call.
+ */
+ WT_CLEAR(i1);
+ WT_CLEAR(i2);
+ WT_CLEAR(i3);
+
+ WT_RET(__wt_library_init());
+
+ WT_RET(__wt_calloc_def(NULL, 1, &conn));
+ conn->iface = stdc;
+
+ /*
+ * Immediately link the structure into the connection structure list:
+ * the only thing ever looked at on that list is the database name,
+ * and a NULL value is fine.
+ */
+ __wt_spin_lock(NULL, &__wt_process.spinlock);
+ TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
+ __wt_spin_unlock(NULL, &__wt_process.spinlock);
+
+ session = conn->default_session = &conn->dummy_session;
+ session->iface.connection = &conn->iface;
+ session->name = "wiredtiger_open";
+ __wt_random_init(session->rnd);
+ __wt_event_handler_set(session, event_handler);
+
+ /* Remaining basic initialization of the connection structure. */
+ WT_ERR(__wt_connection_init(conn));
+
+ /* Check/set the application-specified configuration string. */
+ WT_ERR(__wt_config_check(session,
+ WT_CONFIG_REF(session, wiredtiger_open), config, 0));
+ cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open);
+ cfg[1] = config;
+
+ /* Configure error messages so we get them right early. */
+ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
+ if (cval.len != 0)
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &conn->error_prefix));
+
+ /* Get the database home. */
+ WT_ERR(__conn_home(session, home, cfg));
+
+ /* Make sure no other thread of control already owns this database. */
+ WT_ERR(__conn_single(session, cfg));
+
+ /*
+ * Build the configuration stack, in the following order (where later
+ * entries override earlier entries):
+ *
+ * 1. all possible wiredtiger_open configurations
+ * 2. base configuration file, created with the database (optional)
+ * 3. the config passed in by the application.
+ * 4. user configuration file (optional)
+ * 5. environment variable settings (optional)
+ *
+	 * Clear the entries we added to the stack; we're going to build it in
+	 * order.
+ */
+ cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open_all);
+ cfg[1] = NULL;
+ WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, &i1));
+ __conn_config_append(cfg, config);
+ WT_ERR(__conn_config_file(session, WT_USERCONFIG, 1, cfg, &i2));
+ WT_ERR(__conn_config_env(session, cfg, &i3));
+
+ /*
+ * Configuration ...
+ *
+ * We can't open sessions yet, so any configurations that cause
+ * sessions to be opened must be handled inside __wt_connection_open.
+ *
+ * The error message configuration might have changed (if set in a
+ * configuration file, and not in the application's configuration
+ * string), get it again. Do it first, make error messages correct.
+ */
+ WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
+ if (cval.len != 0) {
+ __wt_free(session, conn->error_prefix);
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &conn->error_prefix));
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
+ conn->hazard_max = (uint32_t)cval.val;
+
+ WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
+ conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+
+ WT_ERR(__wt_config_gets(session, cfg, "checkpoint_sync", &cval));
+ if (cval.val)
+ F_SET(conn, WT_CONN_CKPT_SYNC);
+
+ WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
+ if (cval.val == -1)
+ conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
+ else
+ conn->buffer_alignment = (size_t)cval.val;
+#ifndef HAVE_POSIX_MEMALIGN
+ if (conn->buffer_alignment != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "buffer_alignment requires posix_memalign");
+#endif
+
+ WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
+ for (ft = file_types; ft->name != NULL; ft++) {
+ ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+ if (ret == 0) {
+ if (sval.val)
+ FLD_SET(conn->direct_io, ft->flag);
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "file_extend", &cval));
+ for (ft = file_types; ft->name != NULL; ft++) {
+ ret = __wt_config_subgets(session, &cval, ft->name, &sval);
+ if (ret == 0) {
+ switch (ft->flag) {
+ case WT_FILE_TYPE_DATA:
+ conn->data_extend_len = sval.val;
+ break;
+ case WT_FILE_TYPE_LOG:
+ conn->log_extend_len = sval.val;
+ break;
+ }
+ } else if (ret != WT_NOTFOUND)
+ goto err;
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
+ conn->mmap = cval.val == 0 ? 0 : 1;
+
+ WT_ERR(__conn_statistics_config(session, cfg));
+ WT_ERR(__wt_lsm_manager_config(session, cfg));
+ WT_ERR(__wt_verbose_config(session, cfg));
+
+ /* Now that we know if verbose is configured, output the version. */
+ WT_ERR(__wt_verbose(
+ session, WT_VERB_VERSION, "%s", WIREDTIGER_VERSION_STRING));
+
+ /*
+ * Open the connection, then reset the local session as the real one
+ * was allocated in __wt_connection_open.
+ */
+ WT_ERR(__wt_connection_open(conn, cfg));
+ session = conn->default_session;
+
+ /*
+ * Check on the turtle and metadata files, creating them if necessary
+ * (which avoids application threads racing to create the metadata file
+ * later). Once the metadata file exists, get a reference to it in
+ * the connection's session.
+ */
+ WT_ERR(__wt_turtle_init(session));
+ WT_ERR(__wt_metadata_open(session));
+
+ /*
+ * Load the extensions after initialization completes; extensions expect
+ * everything else to be in place, and the extensions call back into the
+ * library.
+ */
+ WT_ERR(__conn_load_extensions(session, cfg));
+
+ /*
+ * We've completed configuration, write the base configuration file if
+ * we're creating the database.
+ */
+ if (conn->is_new) {
+ WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
+ if (cval.val)
+ WT_ERR(
+ __conn_write_config(session, WT_BASECONFIG, cfg));
+ }
+
+ /*
+ * Start the worker threads last.
+ */
+ WT_ERR(__wt_connection_workers(session, cfg));
+
+ /* Merge the final configuration for later reconfiguration. */
+ WT_ERR(__wt_config_merge(session, cfg, &conn->cfg));
+
+ WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
+ *wt_connp = &conn->iface;
+
+err: /* Discard the configuration strings. */
+ __wt_buf_free(session, &i1);
+ __wt_buf_free(session, &i2);
+ __wt_buf_free(session, &i3);
+
+ if (ret != 0 && conn != NULL)
+ WT_TRET(__wt_connection_close(conn));
+
+ return (ret);
+}
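A minimal end-to-end sketch of the entry point; the home path and configuration are illustrative, and error handling is elided:

	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret;

	ret = wiredtiger_open("/path/to/db", NULL,
	    "create,cache_size=500MB", &conn);
	ret = conn->open_session(conn, NULL, NULL, &session);
	/* ... create tables, open cursors, run transactions ... */
	ret = conn->close(conn, NULL);	/* Also closes open sessions. */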
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache.c b/src/third_party/wiredtiger/src/conn/conn_cache.c
new file mode 100644
index 00000000000..079bd05ff1e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_cache.c
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cache_config --
+ * Configure the underlying cache.
+ */
+int
+__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CACHE *cache;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /*
+ * If not using a shared cache configure the cache size, otherwise
+ * check for a reserved size.
+ */
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) {
+ WT_RET(__wt_config_gets(session, cfg, "cache_size", &cval));
+ conn->cache_size = (uint64_t)cval.val;
+ } else {
+ WT_RET(__wt_config_gets(
+ session, cfg, "shared_cache.reserve", &cval));
+ if (cval.val == 0)
+ WT_RET(__wt_config_gets(
+ session, cfg, "shared_cache.chunk", &cval));
+ cache->cp_reserved = (uint64_t)cval.val;
+ }
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction_target", &cval));
+ cache->eviction_target = (u_int)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &cval));
+ cache->eviction_trigger = (u_int)cval.val;
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &cval));
+ cache->eviction_dirty_target = (u_int)cval.val;
+
+ /*
+ * The eviction thread configuration options include the main eviction
+ * thread and workers. Our implementation splits them out. Adjust for
+ * the difference when parsing the configuration.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval));
+ WT_ASSERT(session, cval.val > 0);
+ conn->evict_workers_max = (u_int)cval.val - 1;
+
+ WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &cval));
+ WT_ASSERT(session, cval.val > 0);
+ conn->evict_workers_min = (u_int)cval.val - 1;
+
+ if (conn->evict_workers_min > conn->evict_workers_max)
+ WT_RET_MSG(session, EINVAL,
+ "eviction=(threads_min) cannot be greater than "
+ "eviction=(threads_max)");
+
+ return (0);
+}
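Because the eviction server is counted separately from its workers, a configuration such as the following (values are arbitrary) yields evict_workers_min == 1 and evict_workers_max == 3 internally:

	ret = conn->reconfigure(conn,
	    "eviction=(threads_min=2,threads_max=4)");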
+
+/*
+ * __wt_cache_create --
+ * Create the underlying cache.
+ */
+int
+__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, conn->cache == NULL ||
+ (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL));
+
+ WT_RET(__wt_calloc_def(session, 1, &conn->cache));
+
+ cache = conn->cache;
+
+ /* Use a common routine for run-time configuration options. */
+ WT_RET(__wt_cache_config(session, cfg));
+
+ /* Add the configured cache to the cache pool. */
+ if (F_ISSET(conn, WT_CONN_CACHE_POOL))
+ WT_RET(__wt_conn_cache_pool_open(session));
+
+ /*
+ * The target size must be lower than the trigger size or we will never
+ * get any work done.
+ */
+ if (cache->eviction_target >= cache->eviction_trigger)
+ WT_ERR_MSG(session, EINVAL,
+ "eviction target must be lower than the eviction trigger");
+
+ WT_ERR(__wt_cond_alloc(session,
+ "cache eviction server", 0, &cache->evict_cond));
+ WT_ERR(__wt_cond_alloc(session,
+ "eviction waiters", 0, &cache->evict_waiter_cond));
+ WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
+ WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));
+
+ /* Allocate the LRU eviction queue. */
+ cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
+ WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));
+
+ /*
+ * We get/set some values in the cache statistics (rather than have
+ * two copies), configure them.
+ */
+ __wt_cache_stats_update(session);
+ return (0);
+
+err: WT_RET(__wt_cache_destroy(session));
+ return (ret);
+}
+
+/*
+ * __wt_cache_stats_update --
+ * Update the cache statistics for return to the application.
+ */
+void
+__wt_cache_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ stats = &conn->stats;
+
+ WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
+ WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache));
+ WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
+ WT_STAT_SET(stats, cache_bytes_dirty, cache->bytes_dirty);
+ WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);
+}
+
+/*
+ * __wt_cache_destroy --
+ * Discard the underlying cache.
+ */
+int
+__wt_cache_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ if (cache == NULL)
+ return (0);
+
+ WT_TRET(__wt_cond_destroy(session, &cache->evict_cond));
+ WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond));
+ __wt_spin_destroy(session, &cache->evict_lock);
+ __wt_spin_destroy(session, &cache->evict_walk_lock);
+
+ __wt_free(session, cache->evict);
+ __wt_free(session, conn->cache);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
new file mode 100644
index 00000000000..ba80ac15267
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -0,0 +1,639 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Tuning constants.
+ */
+/* Threshold when a connection is allocated more cache */
+#define WT_CACHE_POOL_BUMP_THRESHOLD 6
+/* Threshold when a connection is allocated less cache */
+#define WT_CACHE_POOL_REDUCE_THRESHOLD 2
+/* Balancing passes after a bump before a connection is a candidate. */
+#define WT_CACHE_POOL_BUMP_SKIPS 10
+/* Balancing passes after a reduction before a connection is a candidate. */
+#define WT_CACHE_POOL_REDUCE_SKIPS 5
+
+static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *);
+static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *);
+static int __cache_pool_balance(WT_SESSION_IMPL *);
+
+/*
+ * __wt_cache_pool_config --
+ * Parse and setup the cache pool options.
+ */
+int
+__wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_CACHE_POOL *cp;
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn, *entry;
+ WT_DECL_RET;
+ char *pool_name;
+ int created, reconfiguring;
+ uint64_t chunk, reserve, size, used_cache;
+
+ conn = S2C(session);
+ created = reconfiguring = 0;
+ pool_name = NULL;
+ cp = NULL;
+ size = 0;
+
+ if (F_ISSET(conn, WT_CONN_CACHE_POOL))
+ reconfiguring = 1;
+ else {
+ WT_RET(
+ __wt_config_gets(session, cfg, "shared_cache.name", &cval));
+ if (cval.len == 0) {
+ /*
+ * Tell the user if they configured some shared cache
+ * settings, but didn't enable it by naming it.
+ */
+ if (__wt_config_gets(session,
+ &cfg[1], "shared_cache", &cval) != WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "Shared cache configuration requires a "
+ "pool name");
+ return (0);
+ }
+ if (__wt_config_gets(session,
+ &cfg[1], "cache_size", &cval) != WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "Only one of cache_size and shared_cache can be "
+ "in the configuration");
+
+ /*
+ * NOTE: The allocations made when configuring and opening a
+ * cache pool don't really belong to the connection that
+ * allocates them. If a memory allocator becomes connection
+ * specific in the future we will need a way to allocate memory
+ * outside of the connection here.
+ */
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &pool_name));
+ }
+
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ if (__wt_process.cache_pool == NULL) {
+ WT_ASSERT(session, !reconfiguring);
+ /* Create a cache pool. */
+ WT_ERR(__wt_calloc_def(session, 1, &cp));
+ created = 1;
+ cp->name = pool_name;
+ pool_name = NULL; /* Belongs to the cache pool now. */
+ TAILQ_INIT(&cp->cache_pool_qh);
+ WT_ERR(__wt_spin_init(
+ session, &cp->cache_pool_lock, "cache shared pool"));
+ WT_ERR(__wt_cond_alloc(session,
+ "cache pool server", 0, &cp->cache_pool_cond));
+
+ __wt_process.cache_pool = cp;
+ WT_ERR(__wt_verbose(session,
+ WT_VERB_SHARED_CACHE, "Created cache pool %s", cp->name));
+ } else if (!reconfiguring && !WT_STRING_MATCH(
+ __wt_process.cache_pool->name, pool_name, strlen(pool_name)))
+ /* Only a single cache pool is supported. */
+ WT_ERR_MSG(session, WT_ERROR,
+ "Attempting to join a cache pool that does not exist: %s",
+ pool_name);
+
+ cp = __wt_process.cache_pool;
+
+ /*
+ * The cache pool requires a reference count to avoid a race between
+ * configuration/open and destroy.
+ */
+ if (!reconfiguring)
+ ++cp->refs;
+
+ /*
+ * Cache pool configurations are optional when not creating. If
+ * values aren't being changed, retrieve the current value so that
+ * validation of settings works.
+ */
+ if (!created) {
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.size", &cval) == 0 && cval.val != 0)
+ size = (uint64_t)cval.val;
+ else
+ size = cp->size;
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.chunk", &cval) == 0 && cval.val != 0)
+ chunk = (uint64_t)cval.val;
+ else
+ chunk = cp->chunk;
+ } else {
+ /*
+ * The only time shared cache configuration uses default
+ * values is when we are creating the pool.
+ */
+ WT_ERR(__wt_config_gets(
+ session, cfg, "shared_cache.size", &cval));
+ WT_ASSERT(session, cval.val != 0);
+ size = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(
+ session, cfg, "shared_cache.chunk", &cval));
+ WT_ASSERT(session, cval.val != 0);
+ chunk = (uint64_t)cval.val;
+ }
+
+ /*
+	 * Retrieve the reserve size here to validate the configuration.
+	 * Don't save it yet since the connection's cache is not created if
+	 * we are opening. Cache configuration is responsible for saving the
+	 * setting.
+	 * The reserve size is set in one of three ways:
+	 * - It's part of the user's configuration - use that value.
+ * - We are reconfiguring - keep the previous value.
+ * - We are joining a cache pool for the first time (including
+ * creating the pool) - use the chunk size; that's the default.
+ */
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.reserve", &cval) == 0 && cval.val != 0)
+ reserve = (uint64_t)cval.val;
+ else if (reconfiguring)
+ reserve = conn->cache->cp_reserved;
+ else
+ reserve = chunk;
+
+ /*
+ * Validate that size and reserve values don't cause the cache
+ * pool to be over subscribed.
+ */
+ used_cache = 0;
+ if (!created) {
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq)
+ used_cache += entry->cache->cp_reserved;
+ }
+ if (used_cache + reserve > size)
+ WT_ERR_MSG(session, EINVAL,
+ "Shared cache unable to accommodate this configuration. "
+ "Shared cache size: %" PRIu64 ", reserved: %" PRIu64,
+ size, used_cache + reserve);
+
+ /* The configuration is verified - it's safe to update the pool. */
+ cp->size = size;
+ cp->chunk = chunk;
+
+ /* Wake up the cache pool server so any changes are noticed. */
+ if (reconfiguring)
+ WT_ERR(__wt_cond_signal(
+ session, __wt_process.cache_pool->cache_pool_cond));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Configured cache pool %s. Size: %" PRIu64
+ ", chunk size: %" PRIu64, cp->name, cp->size, cp->chunk));
+
+ F_SET(conn, WT_CONN_CACHE_POOL);
+err: __wt_spin_unlock(session, &__wt_process.spinlock);
+ if (!reconfiguring)
+ __wt_free(session, pool_name);
+ if (ret != 0 && created) {
+ __wt_free(session, cp->name);
+ WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_free(session, cp);
+ }
+ return (ret);
+}
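Two connections joining one pool might be opened as follows; the paths, pool name and sizes are illustrative, and note that cache_size cannot be combined with shared_cache, per the check above:

	ret = wiredtiger_open("/path/db1", NULL,
	    "create,shared_cache=(name=pool,size=2GB,chunk=10MB)", &conn1);
	ret = wiredtiger_open("/path/db2", NULL,
	    "create,shared_cache=(name=pool,reserve=100MB)", &conn2);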
+
+/*
+ * __wt_conn_cache_pool_open --
+ * Add a connection to the cache pool.
+ */
+int
+__wt_conn_cache_pool_open(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cp = __wt_process.cache_pool;
+
+ /*
+	 * Create a session that can be used by the cache pool thread; do
+	 * it in the main thread to avoid shutdown races.
+ */
+ if ((ret = __wt_open_internal_session(
+ conn, "cache-pool", 0, 0, &cache->cp_session)) != 0)
+ WT_RET_MSG(NULL, ret,
+ "Failed to create session for cache pool");
+
+ /*
+ * Add this connection into the cache pool connection queue. Figure
+ * out if a manager thread is needed while holding the lock. Don't
+ * start the thread until we have released the lock.
+ */
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ TAILQ_INSERT_TAIL(&cp->cache_pool_qh, conn, cpq);
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Added %s to cache pool %s", conn->home, cp->name));
+
+ /*
+ * Each connection participating in the cache pool starts a manager
+ * thread. Only one manager is active at a time, but having a thread
+ * in each connection saves having a complex election process when
+ * the active connection shuts down.
+ */
+ F_SET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ F_SET(cache, WT_CACHE_POOL_RUN);
+ WT_RET(__wt_thread_create(session, &cache->cp_tid,
+ __wt_cache_pool_server, cache->cp_session));
+
+ /* Wake up the cache pool server to get our initial chunk. */
+ WT_RET(__wt_cond_signal(session, cp->cache_pool_cond));
+
+ return (0);
+}
+
+/*
+ * __wt_conn_cache_pool_destroy --
+ * Remove our resources from the shared cache pool. Remove the cache pool
+ * if we were the last connection.
+ */
+int
+__wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_CONNECTION_IMPL *conn, *entry;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ int cp_locked, found;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ cp_locked = found = 0;
+ cp = __wt_process.cache_pool;
+
+ if (!F_ISSET(conn, WT_CONN_CACHE_POOL))
+ return (0);
+
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ cp_locked = 1;
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq)
+ if (entry == conn) {
+ found = 1;
+ break;
+ }
+
+ /*
+ * If there was an error during open, we may not have made it onto the
+ * queue. We did increment the reference count, so proceed regardless.
+ */
+ if (found) {
+ WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Removing %s from cache pool", entry->home));
+ TAILQ_REMOVE(&cp->cache_pool_qh, entry, cpq);
+
+ /* Give the connection's resources back to the pool. */
+ WT_ASSERT(session, cp->currently_used >= conn->cache_size);
+ cp->currently_used -= conn->cache_size;
+
+ /*
+ * Stop our manager thread - release the cache pool lock while
+ * joining the thread to allow it to complete any balance
+ * operation.
+ */
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ cp_locked = 0;
+
+ F_CLR(cache, WT_CACHE_POOL_RUN);
+ WT_TRET(__wt_cond_signal(session, cp->cache_pool_cond));
+ WT_TRET(__wt_thread_join(session, cache->cp_tid));
+
+ wt_session = &cache->cp_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ /*
+ * Grab the lock again now to stop other threads joining the
+ * pool while we are figuring out whether we were the last
+ * participant.
+ */
+ __wt_spin_lock(session, &cp->cache_pool_lock);
+ cp_locked = 1;
+ }
+
+ /*
+ * If there are no references, we are cleaning up after a failed
+ * wiredtiger_open, there is nothing further to do.
+ */
+ if (cp->refs < 1) {
+ if (cp_locked)
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ return (0);
+ }
+
+ if (--cp->refs == 0) {
+ WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh));
+ F_CLR_ATOMIC(cp, WT_CACHE_POOL_ACTIVE);
+ }
+
+ if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE)) {
+ WT_TRET(__wt_verbose(
+ session, WT_VERB_SHARED_CACHE, "Destroying cache pool"));
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ /*
+ * We have been holding the pool lock - no connections could
+ * have been added.
+ */
+ WT_ASSERT(session,
+ cp == __wt_process.cache_pool &&
+ TAILQ_EMPTY(&cp->cache_pool_qh));
+ __wt_process.cache_pool = NULL;
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+ cp_locked = 0;
+
+ /* Now free the pool. */
+ __wt_free(session, cp->name);
+
+ __wt_spin_destroy(session, &cp->cache_pool_lock);
+ WT_TRET(__wt_cond_destroy(session, &cp->cache_pool_cond));
+ __wt_free(session, cp);
+ }
+
+ if (cp_locked) {
+ __wt_spin_unlock(session, &cp->cache_pool_lock);
+
+ /* Notify other participants if we were managing */
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) {
+ F_CLR_ATOMIC(cp, WT_CACHE_POOL_MANAGED);
+ WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Shutting down shared cache manager connection"));
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __cache_pool_balance --
+ * Do a pass over the cache pool members and ensure the pool is being
+ * effectively used.
+ */
+static int
+__cache_pool_balance(WT_SESSION_IMPL *session)
+{
+ WT_CACHE_POOL *cp;
+ WT_DECL_RET;
+ int adjusted;
+ uint64_t bump_threshold, highest;
+
+ cp = __wt_process.cache_pool;
+ adjusted = 0;
+ highest = 0;
+
+ __wt_spin_lock(NULL, &cp->cache_pool_lock);
+
+ /* If the queue is empty there is nothing to do. */
+ if (TAILQ_FIRST(&cp->cache_pool_qh) == NULL)
+ goto err;
+
+ WT_ERR(__cache_pool_assess(session, &highest));
+ bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+ /*
+ * Actively attempt to:
+ * - Reduce the amount allocated, if we are over the budget
+ * - Increase the amount used if there is capacity and any pressure.
+ */
+ for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+ F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) {
+ WT_ERR(__cache_pool_adjust(
+ session, highest, bump_threshold, &adjusted));
+ /*
+ * Stop if the amount of cache being used is stable, and we
+ * aren't over capacity.
+ */
+ if (cp->currently_used <= cp->size && !adjusted)
+ break;
+ if (bump_threshold > 0)
+ --bump_threshold;
+ }
+
+err: __wt_spin_unlock(NULL, &cp->cache_pool_lock);
+ return (ret);
+}
+
+/*
+ * __cache_pool_assess --
+ * Assess the usage of the cache pool.
+ */
+static int
+__cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
+{
+ WT_CACHE_POOL *cp;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *entry;
+ uint64_t entries, highest, new;
+
+ cp = __wt_process.cache_pool;
+ entries = highest = 0;
+
+ /* Generate read pressure information. */
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ if (entry->cache_size == 0 ||
+ entry->cache == NULL)
+ continue;
+ cache = entry->cache;
+ ++entries;
+ new = cache->bytes_evict;
+ /* Handle wrapping of eviction requests. */
+ if (new >= cache->cp_saved_evict)
+ cache->cp_current_evict = new - cache->cp_saved_evict;
+ else
+ cache->cp_current_evict = new;
+ cache->cp_saved_evict = new;
+ if (cache->cp_current_evict > highest)
+ highest = cache->cp_current_evict;
+ }
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Highest eviction count: %" PRIu64 ", entries: %" PRIu64,
+ highest, entries));
+ /* Normalize eviction information across connections. */
+ highest = highest / (entries + 1);
+ ++highest; /* Avoid divide by zero. */
+
+ *phighest = highest;
+ return (0);
+}
+
+/*
+ * __cache_pool_adjust --
+ *	Adjust the allocation of cache to each connection. If force is set,
+ *	ignore cache load information and reduce the allocation for every
+ *	connection allocated more than its reserved size.
+ */
+static int
+__cache_pool_adjust(WT_SESSION_IMPL *session,
+ uint64_t highest, uint64_t bump_threshold, int *adjustedp)
+{
+ WT_CACHE_POOL *cp;
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *entry;
+ uint64_t adjusted, reserved, read_pressure;
+ int force, grew;
+
+ *adjustedp = 0;
+ cp = __wt_process.cache_pool;
+ force = (cp->currently_used > cp->size);
+ grew = 0;
+ if (WT_VERBOSE_ISSET(session, WT_VERB_SHARED_CACHE)) {
+ WT_RET(__wt_verbose(session,
+ WT_VERB_SHARED_CACHE, "Cache pool distribution: "));
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "\t" "cache_size, read_pressure, skips: "));
+ }
+
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ cache = entry->cache;
+ reserved = cache->cp_reserved;
+ adjusted = 0;
+
+ read_pressure = cache->cp_current_evict / highest;
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32,
+ entry->cache_size, read_pressure, cache->cp_skip_count));
+
+ /* Allow to stabilize after changes. */
+ if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0)
+ continue;
+ /*
+ * If the entry is currently allocated less than the reserved
+		 * size, increase its allocation. This should only happen if:
+ * - It's the first time we've seen this member
+ * - The reserved size has been adjusted
+ */
+ if (entry->cache_size < reserved) {
+ grew = 1;
+ adjusted = reserved - entry->cache_size;
+ /*
+ * Conditions for reducing the amount of resources for an
+ * entry:
+ * - If we are forcing and this entry has more than the
+ * minimum amount of space in use.
+ * - If the read pressure in this entry is below the
+ * threshold, other entries need more cache, the entry has
+ * more than the minimum space and there is no available
+ * space in the pool.
+ */
+ } else if ((force && entry->cache_size > reserved) ||
+ (read_pressure < WT_CACHE_POOL_REDUCE_THRESHOLD &&
+ highest > 1 && entry->cache_size > reserved &&
+ cp->currently_used >= cp->size)) {
+ grew = 0;
+ /*
+ * Shrink by a chunk size if that doesn't drop us
+ * below the reserved size.
+ */
+ if (entry->cache_size > cp->chunk + reserved)
+ adjusted = cp->chunk;
+ else
+ adjusted = entry->cache_size - reserved;
+ /*
+ * Conditions for increasing the amount of resources for an
+ * entry:
+ * - There was some activity across the pool
+ * - This entry is using less than the entire cache pool
+ * - The connection is using enough cache to require eviction
+ * - There is space available in the pool
+ * - Additional cache would benefit the connection
+ */
+ } else if (highest > 1 &&
+ entry->cache_size < cp->size &&
+ cache->bytes_inmem >=
+ (entry->cache_size * cache->eviction_target) / 100 &&
+ cp->currently_used < cp->size &&
+ read_pressure > bump_threshold) {
+ grew = 1;
+ adjusted = WT_MIN(cp->chunk,
+ cp->size - cp->currently_used);
+ }
+ if (adjusted > 0) {
+ *adjustedp = 1;
+ if (grew > 0) {
+ cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS;
+ entry->cache_size += adjusted;
+ cp->currently_used += adjusted;
+ } else {
+ cache->cp_skip_count =
+ WT_CACHE_POOL_REDUCE_SKIPS;
+ WT_ASSERT(session,
+ entry->cache_size >= adjusted &&
+ cp->currently_used >= adjusted);
+ entry->cache_size -= adjusted;
+ cp->currently_used -= adjusted;
+ }
+ WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Allocated %s%" PRId64 " to %s",
+ grew ? "" : "-", adjusted, entry->home));
+ /*
+ * TODO: Add a loop waiting for connection to give up
+ * cache.
+ */
+ }
+ }
+ return (0);
+}
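As a worked example of a reduction pass: take a pool with chunk=10MB whose currently_used has reached its configured size, and an entry holding 50MB above its reserve whose read_pressure is below WT_CACHE_POOL_REDUCE_THRESHOLD (2) while there is eviction activity elsewhere (highest > 1). The entry is shrunk by a full chunk, 10MB, because that still leaves it above its reserve, and its cp_skip_count is set so it is left alone for the next WT_CACHE_POOL_REDUCE_SKIPS (5) balancing passes.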
+
+/*
+ * __wt_cache_pool_server --
+ * Thread to manage cache pool among connections.
+ */
+void *
+__wt_cache_pool_server(void *arg)
+{
+ WT_CACHE *cache;
+ WT_CACHE_POOL *cp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)arg;
+
+ cp = __wt_process.cache_pool;
+ cache = S2C(session)->cache;
+
+ while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(cache, WT_CACHE_POOL_RUN)) {
+ if (cp->currently_used <= cp->size)
+ WT_ERR(__wt_cond_wait(session,
+ cp->cache_pool_cond, 1000000));
+
+ /*
+		 * Re-check the pool's run flags, since we want to avoid taking
+		 * the lock during shutdown.
+ */
+ if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(cache, WT_CACHE_POOL_RUN))
+ break;
+
+ /* Try to become the managing thread */
+ F_CAS_ATOMIC(cp, WT_CACHE_POOL_MANAGED, ret);
+ if (ret == 0) {
+ F_SET(cache, WT_CACHE_POOL_MANAGER);
+ WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE,
+ "Cache pool switched manager thread"));
+ }
+
+ /*
+ * Continue even if there was an error. Details of errors are
+ * reported in the balance function.
+ */
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER))
+ (void)__cache_pool_balance(session);
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "cache pool manager server error");
+ }
+ return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_ckpt.c b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
new file mode 100644
index 00000000000..ab97d4ead46
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_ckpt.c
@@ -0,0 +1,228 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_server_start(WT_CONNECTION_IMPL *);
+
+/*
+ * __ckpt_server_config --
+ * Parse and setup the checkpoint server options.
+ */
+static int
+__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ char *p;
+
+ conn = S2C(session);
+
+ /*
+ * The checkpoint configuration requires a wait time and/or a log
+ * size -- if one is not set, we're not running at all.
+ * Checkpoints based on log size also require logging be enabled.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
+ conn->ckpt_usecs = (long)cval.val * 1000000;
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
+ conn->ckpt_logsize = (wt_off_t)cval.val;
+ __wt_log_written_reset(session);
+ if ((conn->ckpt_usecs == 0 && conn->ckpt_logsize == 0) ||
+ (conn->ckpt_logsize && !conn->logging && conn->ckpt_usecs == 0)) {
+ *startp = 0;
+ return (0);
+ }
+ *startp = 1;
+
+ /*
+ * The application can specify a checkpoint name, which we ignore if
+ * it's our default.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval));
+ if (cval.len != 0 &&
+ !WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+ WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+
+ WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp));
+ WT_ERR(__wt_buf_fmt(
+ session, tmp, "name=%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strdup(session, tmp->data, &p));
+
+ __wt_free(session, conn->ckpt_config);
+ conn->ckpt_config = p;
+ }
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __ckpt_server --
+ * The checkpoint server thread.
+ */
+static void *
+__ckpt_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+ wt_session = (WT_SESSION *)session;
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) {
+ /* Checkpoint the database. */
+ WT_ERR(wt_session->checkpoint(wt_session, conn->ckpt_config));
+
+ /* Reset. */
+ if (conn->ckpt_logsize) {
+ __wt_log_written_reset(session);
+ conn->ckpt_signalled = 0;
+ }
+ /*
+ * Wait...
+		 * NOTE: If the user only configured log_size, then usecs
+ * will be 0 and this wait won't return until signalled.
+ */
+ WT_ERR(
+ __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "checkpoint server error");
+ }
+ return (NULL);
+}
+
+/*
+ * __ckpt_server_start --
+ * Start the checkpoint server thread.
+ */
+static int
+__ckpt_server_start(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+
+ /* Nothing to do if the server is already running. */
+ if (conn->ckpt_session != NULL)
+ return (0);
+
+ F_SET(conn, WT_CONN_SERVER_CHECKPOINT);
+ /* The checkpoint server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "checkpoint-server", 1, 1, &conn->ckpt_session));
+ session = conn->ckpt_session;
+
+ /*
+ * Checkpoint does enough I/O it may be called upon to perform slow
+ * operations for the block manager.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ WT_RET(
+ __wt_cond_alloc(session, "checkpoint server", 0, &conn->ckpt_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->ckpt_tid, __ckpt_server, session));
+ conn->ckpt_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_checkpoint_server_create --
+ * Configure and start the checkpoint server.
+ */
+int
+__wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ int start;
+
+ conn = S2C(session);
+ start = 0;
+
+ /* If there is already a server running, shut it down. */
+ if (conn->ckpt_session != NULL)
+ WT_RET(__wt_checkpoint_server_destroy(session));
+
+ WT_RET(__ckpt_server_config(session, cfg, &start));
+ if (start)
+ WT_RET(__ckpt_server_start(conn));
+
+ return (0);
+}
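An illustrative open that runs the server on both a timer and a log threshold; the non-default checkpoint name is optional, and log-size triggering requires logging be enabled:

	ret = wiredtiger_open(home, NULL,
	    "create,log=(enabled=true),"
	    "checkpoint=(wait=60,log_size=2GB,name=demo)", &conn);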
+
+/*
+ * __wt_checkpoint_server_destroy --
+ * Destroy the checkpoint server thread.
+ */
+int
+__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
+ if (conn->ckpt_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->ckpt_cond));
+ WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
+ conn->ckpt_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));
+
+ __wt_free(session, conn->ckpt_config);
+
+ /* Close the server thread's session. */
+ if (conn->ckpt_session != NULL) {
+ wt_session = &conn->ckpt_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+
+ /*
+	 * Ensure checkpoint settings are cleared, so that a subsequent
+	 * reconfigure doesn't get confused.
+ */
+ conn->ckpt_session = NULL;
+ conn->ckpt_tid_set = 0;
+ conn->ckpt_cond = NULL;
+ conn->ckpt_config = NULL;
+ conn->ckpt_usecs = 0;
+
+ return (ret);
+}
+
+/*
+ * __wt_checkpoint_signal --
+ * Signal the checkpoint thread if sufficient log has been written.
+ */
+int
+__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ WT_ASSERT(session, WT_CKPT_LOGSIZE(conn));
+ if (logsize >= conn->ckpt_logsize && !conn->ckpt_signalled) {
+ WT_RET(__wt_cond_signal(session, conn->ckpt_cond));
+ conn->ckpt_signalled = 1;
+ }
+ return (0);
+}
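+
+/*
+ * Illustration (editor's sketch, not part of this change): a hypothetical
+ * caller in the logging code, after bumping its running total of log bytes
+ * written, might wake the checkpoint server like so:
+ *
+ *	if (WT_CKPT_LOGSIZE(conn))
+ *		WT_RET(__wt_checkpoint_signal(session, log_bytes_written));
+ *
+ * where log_bytes_written is an invented name for the logging subsystem's
+ * counter. The ckpt_signalled flag above means the condition variable is
+ * signalled at most once per checkpoint; the server clears the flag after
+ * each checkpoint completes (the "Reset" step in __ckpt_server).
+ */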
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
new file mode 100644
index 00000000000..f4f540e33c7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -0,0 +1,694 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __conn_dhandle_open_lock --
+ * Spin on the current data handle until either (a) it is open, read
+ * locked; or (b) it is closed, write locked. If exclusive access is
+ * requested and cannot be granted immediately, fail with EBUSY.
+ */
+static int
+__conn_dhandle_open_lock(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+
+ btree = dhandle->handle;
+
+ /*
+ * Check that the handle is open. We've already incremented
+ * the reference count, so once the handle is open it won't be
+ * closed by another thread.
+ *
+ * If we can see the WT_DHANDLE_OPEN flag set while holding a
+ * lock on the handle, then it's really open and we can start
+ * using it. Alternatively, if we can get an exclusive lock
+ * and WT_DHANDLE_OPEN is still not set, we need to do the open.
+ */
+ for (;;) {
+ if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE) &&
+ F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+ return (EBUSY);
+
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ WT_RET(__wt_readlock(session, dhandle->rwlock));
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ return (0);
+ WT_RET(__wt_readunlock(session, dhandle->rwlock));
+ }
+
+ /*
+ * It isn't open or we want it exclusive: try to get an
+ * exclusive lock. There is some subtlety here: if we race
+ * with another thread that successfully opens the file, we
+ * don't want to block waiting to get exclusive access.
+ */
+ if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) {
+ /*
+ * If it was opened while we waited, drop the write
+ * lock and get a read lock instead.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ !LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ WT_RET(
+ __wt_writeunlock(session, dhandle->rwlock));
+ continue;
+ }
+
+ /* We have an exclusive lock, we're done. */
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ return (0);
+ } else if (ret != EBUSY || LF_ISSET(WT_DHANDLE_EXCLUSIVE))
+ return (EBUSY);
+
+ /* Give other threads a chance to make progress. */
+ __wt_yield();
+ }
+}
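+
+/*
+ * Editor's note (summary, not part of this change): the loop above has three
+ * possible outcomes, and callers must be ready for each:
+ *
+ *	returns 0, read lock held:	the handle is open, shared access;
+ *	returns 0, write lock held:	the handle is closed (or exclusive
+ *					access was requested) and
+ *					WT_DHANDLE_EXCLUSIVE is now set;
+ *	returns EBUSY:			exclusive access couldn't be granted
+ *					without blocking, no lock is held.
+ */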
+
+/*
+ * __conn_dhandle_get --
+ * Find an open btree file handle, otherwise create a new one, lock it
+ * exclusively, and return it linked into the connection's list.
+ */
+static int
+__conn_dhandle_get(WT_SESSION_IMPL *session,
+ const char *name, const char *ckpt, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ uint64_t hash;
+
+ conn = S2C(session);
+
+ /* We must be holding the schema lock at a higher level. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ !LF_ISSET(WT_DHANDLE_HAVE_REF));
+
+ /* Increment the reference count if we already have the btree open. */
+ hash = __wt_hash_city64(name, strlen(name));
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if ((hash == dhandle->name_hash &&
+ strcmp(name, dhandle->name) == 0) &&
+ ((ckpt == NULL && dhandle->checkpoint == NULL) ||
+ (ckpt != NULL && dhandle->checkpoint != NULL &&
+ strcmp(ckpt, dhandle->checkpoint) == 0))) {
+ WT_RET(__conn_dhandle_open_lock(
+ session, dhandle, flags));
+ (void)WT_ATOMIC_ADD4(dhandle->session_ref, 1);
+ session->dhandle = dhandle;
+ return (0);
+ }
+
+ /*
+ * Allocate the data source handle and underlying btree handle, then
+ * initialize the data source handle. Exclusively lock the data
+ * source handle before inserting it in the list.
+ */
+ WT_RET(__wt_calloc_def(session, 1, &dhandle));
+
+ WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));
+ dhandle->session_ref = 1;
+
+ dhandle->name_hash = hash;
+ WT_ERR(__wt_strdup(session, name, &dhandle->name));
+ if (ckpt != NULL)
+ WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));
+
+ WT_ERR(__wt_calloc_def(session, 1, &btree));
+ dhandle->handle = btree;
+ btree->dhandle = dhandle;
+
+ WT_ERR(__wt_spin_init(
+ session, &dhandle->close_lock, "data handle close"));
+
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_ERR(__wt_writelock(session, dhandle->rwlock));
+
+ /*
+ * Prepend the handle to the connection list, assuming we're likely to
+ * need new files again soon, until they are cached by all sessions.
+ *
+ * !!!
+ * We hold only the schema lock here, not the dhandle lock. Eviction
+ * walks this list only holding the dhandle lock. This works because
+ * we're inserting at the beginning of the list, and we're only
+ * publishing this one entry per lock acquisition. Eviction either
+ * sees our newly added entry or the former head of the list, and it
+ * doesn't matter which (if eviction only sees a single element in the
+ * list because the insert races, it will return without finding enough
+ * candidates for eviction, and will then retry).
+ */
+ SLIST_INSERT_HEAD(&conn->dhlh, dhandle, l);
+
+ session->dhandle = dhandle;
+ return (0);
+
+err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
+ __wt_free(session, dhandle->name);
+ __wt_free(session, dhandle->checkpoint);
+ __wt_free(session, dhandle->handle); /* btree free */
+ __wt_spin_destroy(session, &dhandle->close_lock);
+ __wt_overwrite_and_free(session, dhandle);
+
+ return (ret);
+}
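+
+/*
+ * Illustration (editor's sketch, not part of this change): the list-search
+ * predicate above treats a data handle as a (name, checkpoint) pair, where
+ * checkpoint may be NULL for the live tree. Pulled out as a helper, the
+ * match test is roughly:
+ *
+ *	static inline int
+ *	__dhandle_match(WT_DATA_HANDLE *dhandle,
+ *	    uint64_t hash, const char *name, const char *ckpt)
+ *	{
+ *		if (hash != dhandle->name_hash ||
+ *		    strcmp(name, dhandle->name) != 0)
+ *			return (0);
+ *		if (ckpt == NULL || dhandle->checkpoint == NULL)
+ *			return (ckpt == dhandle->checkpoint);
+ *		return (strcmp(ckpt, dhandle->checkpoint) == 0);
+ *	}
+ *
+ * The hash comparison is only a fast path, the strcmp still decides.
+ */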
+
+/*
+ * __wt_conn_btree_sync_and_close --
+ * Sync and close the underlying btree handle.
+ */
+int
+__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force)
+{
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ int no_schema_lock;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ return (0);
+
+ /*
+ * If we don't already have the schema lock, make it an error to try
+ * to acquire it. The problem is that we are holding an exclusive
+ * lock on the handle, and if we attempt to acquire the schema lock
+ * we might deadlock with a thread that has the schema lock and wants
+ * a handle lock (specifically, checkpoint).
+ */
+ no_schema_lock = 0;
+ if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
+ no_schema_lock = 1;
+ F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
+ }
+
+ /*
+ * We may not be holding the schema lock, and threads may be walking
+ * the list of open handles (for example, checkpoint). Acquire the
+ * handle's close lock.
+ */
+ __wt_spin_lock(session, &dhandle->close_lock);
+
+ /*
+	 * The close can fail if an update cannot be written; return the EBUSY
+ * error to our caller for eventual retry.
+ */
+ if (!F_ISSET(btree,
+ WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
+ WT_ERR(__wt_checkpoint_close(session, force));
+
+ if (dhandle->checkpoint == NULL)
+ --S2C(session)->open_btree_count;
+
+ WT_TRET(__wt_btree_close(session));
+ F_CLR(dhandle, WT_DHANDLE_OPEN);
+ F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+
+err: __wt_spin_unlock(session, &dhandle->close_lock);
+
+ if (no_schema_lock)
+ F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);
+
+ return (ret);
+}
+
+/*
+ * __conn_btree_config_clear --
+ * Clear the underlying object's configuration information.
+ */
+static void
+__conn_btree_config_clear(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ const char **a;
+
+ dhandle = session->dhandle;
+
+ if (dhandle->cfg == NULL)
+ return;
+ for (a = dhandle->cfg; *a != NULL; ++a)
+ __wt_free(session, *a);
+ __wt_free(session, dhandle->cfg);
+}
+
+/*
+ * __conn_btree_config_set --
+ * Set up a btree handle's configuration information.
+ */
+static int
+__conn_btree_config_set(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ const char *metaconf;
+
+ dhandle = session->dhandle;
+
+ /*
+ * Read the object's entry from the metadata file, we're done if we
+ * don't find one.
+ */
+ if ((ret =
+ __wt_metadata_search(session, dhandle->name, &metaconf)) != 0) {
+ if (ret == WT_NOTFOUND)
+ ret = ENOENT;
+ WT_RET(ret);
+ }
+
+ /*
+ * The defaults are included because underlying objects have persistent
+ * configuration information stored in the metadata file. If defaults
+ * are included in the configuration, we can add new configuration
+ * strings without upgrading the metadata file or writing special code
+ * in case a configuration string isn't initialized, as long as the new
+ * configuration string has an appropriate default value.
+ *
+ * The error handling is a little odd, but be careful: we're holding a
+ * chunk of allocated memory in metaconf. If we fail before we copy a
+ * reference to it into the object's configuration array, we must free
+	 * it; after the copy, we must not.
+ */
+ WT_ERR(__wt_calloc_def(session, 3, &dhandle->cfg));
+ WT_ERR(__wt_strdup(
+ session, WT_CONFIG_BASE(session, file_meta), &dhandle->cfg[0]));
+ dhandle->cfg[1] = metaconf;
+ return (0);
+
+err: __wt_free(session, metaconf);
+ return (ret);
+}
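+
+/*
+ * Illustration (editor's note, not part of this change): after a successful
+ * call the handle's configuration stack looks like:
+ *
+ *	dhandle->cfg[0]: the shipped "file_meta" defaults (copied);
+ *	dhandle->cfg[1]: this object's entry from the metadata file;
+ *	dhandle->cfg[2]: NULL terminator (from the zeroed allocation).
+ *
+ * Configuration lookups scan the array with later entries overriding earlier
+ * ones, which is how the metadata entry shadows the shipped defaults.
+ */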
+
+/*
+ * __conn_btree_open --
+ * Open the current btree handle.
+ */
+static int
+__conn_btree_open(
+ WT_SESSION_IMPL *session, const char *op_cfg[], uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+ btree = S2BT(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) &&
+ F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) &&
+ !LF_ISSET(WT_DHANDLE_LOCK_ONLY));
+
+ /*
+ * If the handle is already open, it has to be closed so it can be
+ * reopened with a new configuration. We don't need to check again:
+ * this function isn't called if the handle is already open in the
+ * required mode.
+ *
+ * This call can return EBUSY if there's an update in the object that's
+ * not yet globally visible. That's not a problem because it can only
+ * happen when we're switching from a normal handle to a "special" one,
+ * so we're returning EBUSY to an attempt to verify or do other special
+ * operations. The reverse won't happen because when the handle from a
+ * verify or other special operation is closed, there won't be updates
+ * in the tree that can block the close.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ WT_RET(__wt_conn_btree_sync_and_close(session, 0));
+
+ /* Discard any previous configuration, set up the new configuration. */
+ __conn_btree_config_clear(session);
+ WT_RET(__conn_btree_config_set(session));
+
+ /* Set any special flags on the handle. */
+ F_SET(btree, LF_ISSET(WT_BTREE_SPECIAL_FLAGS));
+
+ do {
+ WT_ERR(__wt_btree_open(session, op_cfg));
+ F_SET(dhandle, WT_DHANDLE_OPEN);
+ /*
+		 * Checkpoint handles are read-only, so it's better for
+		 * eviction calculations based on the number of btrees to
+		 * ignore them.
+ */
+ if (dhandle->checkpoint == NULL)
+ ++S2C(session)->open_btree_count;
+
+ /* Drop back to a readlock if that is all that was needed. */
+ if (!LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_ERR(__wt_writeunlock(session, dhandle->rwlock));
+ WT_ERR(
+ __conn_dhandle_open_lock(session, dhandle, flags));
+ }
+ } while (!F_ISSET(dhandle, WT_DHANDLE_OPEN));
+
+ if (0) {
+err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
+ /*
+ * If the open failed, close the handle. If there was no
+ * reference to the handle in this session, we incremented the
+ * session reference count, so decrement it here. Otherwise,
+ * just close the handle without decrementing.
+ */
+ if (!LF_ISSET(WT_DHANDLE_HAVE_REF))
+ __wt_conn_btree_close(session);
+ else if (F_ISSET(dhandle, WT_DHANDLE_OPEN))
+ WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_get --
+ * Get an open btree file handle, otherwise open a new one.
+ */
+int
+__wt_conn_btree_get(WT_SESSION_IMPL *session,
+ const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ if (LF_ISSET(WT_DHANDLE_HAVE_REF))
+ WT_RET(
+ __conn_dhandle_open_lock(session, session->dhandle, flags));
+ else
+ WT_RET(__conn_dhandle_get(session, name, ckpt, flags));
+ dhandle = session->dhandle;
+
+ if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
+ (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
+ LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
+ if ((ret = __conn_btree_open(session, op_cfg, flags)) != 0) {
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ }
+
+ WT_ASSERT(session, ret != 0 ||
+ LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+ F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
+
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_apply --
+ * Apply a function to all open btree handles apart from the metadata
+ * file.
+ */
+int
+__wt_conn_btree_apply(WT_SESSION_IMPL *session,
+ int apply_checkpoints,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
+ WT_PREFIX_MATCH(dhandle->name, "file:") &&
+ (apply_checkpoints || dhandle->checkpoint == NULL) &&
+ !WT_IS_METADATA(dhandle)) {
+ /*
+ * We need to pull the handle into the session handle
+ * cache and make sure it's referenced to stop other
+			 * internal code dropping the handle (e.g., in LSM when
+ * cleaning up obsolete chunks). Holding the metadata
+ * lock isn't enough.
+ */
+ ret = __wt_session_get_btree(session,
+ dhandle->name, dhandle->checkpoint, NULL, 0);
+ if (ret == 0) {
+ ret = func(session, cfg);
+ if (WT_META_TRACKING(session))
+ WT_TRET(__wt_meta_track_handle_lock(
+ session, 0));
+ else
+ WT_TRET(__wt_session_release_btree(
+ session));
+ } else if (ret == EBUSY)
+ ret = __wt_conn_btree_apply_single(
+ session, dhandle->name,
+ dhandle->checkpoint, func, cfg);
+ WT_RET(ret);
+ }
+
+ return (0);
+}
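+
+/*
+ * Illustration (editor's sketch, not part of this change): a minimal callback
+ * matching the func signature; it runs with session->dhandle set to the
+ * handle being visited:
+ *
+ *	static int
+ *	__handle_visit(WT_SESSION_IMPL *session, const char *cfg[])
+ *	{
+ *		WT_UNUSED(cfg);
+ *		printf("visiting %s\n", session->dhandle->name);
+ *		return (0);
+ *	}
+ *
+ * __statlog_apply in conn_stat.c (later in this change) is a real callback
+ * following the same pattern.
+ */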
+
+/*
+ * __wt_conn_btree_apply_single --
+ * Apply a function to a single btree handle that couldn't be locked
+ * (attempting to get the handle returned EBUSY).
+ */
+int
+__wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+ saved_dhandle = session->dhandle;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ if (strcmp(dhandle->name, uri) == 0 &&
+ ((dhandle->checkpoint == NULL && checkpoint == NULL) ||
+ (dhandle->checkpoint != NULL && checkpoint != NULL &&
+ strcmp(dhandle->checkpoint, checkpoint) == 0))) {
+ /*
+			 * We're holding the schema lock, which locks out handle
+			 * open (which might change the state of the underlying
+			 * object). However, closing a handle doesn't require
+			 * the schema lock, so lock out closing the handle and
+			 * then confirm the handle is still open.
+ */
+ __wt_spin_lock(session, &dhandle->close_lock);
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ session->dhandle = dhandle;
+ ret = func(session, cfg);
+ }
+ __wt_spin_unlock(session, &dhandle->close_lock);
+ WT_ERR(ret);
+ }
+
+err: session->dhandle = saved_dhandle;
+ return (ret);
+}
+
+/*
+ * __wt_conn_btree_close --
+ * Discard a reference to an open btree file handle.
+ */
+void
+__wt_conn_btree_close(WT_SESSION_IMPL *session)
+{
+ (void)WT_ATOMIC_SUB4(session->dhandle->session_ref, 1);
+}
+
+/*
+ * __wt_conn_dhandle_close_all --
+ *	Close all data handles with a matching name (including all
+ * checkpoint handles).
+ */
+int
+__wt_conn_dhandle_close_all(
+ WT_SESSION_IMPL *session, const char *name, int force)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (strcmp(dhandle->name, name) != 0)
+ continue;
+
+ session->dhandle = dhandle;
+
+ /* Lock the handle exclusively. */
+ WT_ERR(__wt_session_get_btree(session,
+ dhandle->name, dhandle->checkpoint,
+ NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_handle_lock(session, 0));
+
+ /*
+ * We have an exclusive lock, which means there are no cursors
+ * open at this point. Close the handle, if necessary.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ if ((ret = __wt_meta_track_sub_on(session)) == 0)
+ ret = __wt_conn_btree_sync_and_close(
+ session, force);
+
+ /*
+ * If the close succeeded, drop any locks it acquired.
+ * If there was a failure, this function will fail and
+ * the whole transaction will be rolled back.
+ */
+ if (ret == 0)
+ ret = __wt_meta_track_sub_off(session);
+ }
+
+ if (!WT_META_TRACKING(session))
+ WT_TRET(__wt_session_release_btree(session));
+
+ WT_ERR(ret);
+ }
+
+err: session->dhandle = NULL;
+ return (ret);
+}
+
+/*
+ * __wt_conn_dhandle_discard_single --
+ * Close/discard a single data handle.
+ */
+int
+__wt_conn_dhandle_discard_single(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *save_dhandle;
+ WT_DECL_RET;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ conn = S2C(session);
+
+ save_dhandle = session->dhandle;
+ session->dhandle = dhandle;
+
+ /*
+ * We're called from the periodic sweep function and the final close;
+ * the former wants to continue if the handle is suddenly found to be
+	 * busy, while the latter wants to shut things down.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ if (!final)
+ WT_ERR(EBUSY);
+ WT_ERR(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ /*
+ * Get the schema lock (required to remove entries from the data handle
+	 * list), then get the dhandle lock to block the eviction server from
+ * walking the list.
+ */
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_lock(session, &conn->schema_lock);
+
+ /*
+ * If the eviction server is running, don't block waiting for it while
+ * holding the schema lock. The sweep server will try again.
+ */
+ if (final)
+ __wt_spin_lock(session, &conn->dhandle_lock);
+ else if ((ret =
+ __wt_spin_trylock(session, &conn->dhandle_lock, &id)) != 0)
+ goto unlock;
+
+ /*
+ * Check if the handle was reacquired by a session while we waited;
+ * this should only happen when called from the periodic sweep code, of
+ * course.
+ */
+ if (!final && dhandle->session_ref != 0)
+ ret = EBUSY;
+ else
+ SLIST_REMOVE(&conn->dhlh, dhandle, __wt_data_handle, l);
+
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+
+unlock: __wt_spin_unlock(session, &conn->schema_lock);
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
+
+ /*
+ * After successfully removing the handle, clean it up.
+ */
+ if (ret == 0) {
+ WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
+ __wt_free(session, dhandle->name);
+ __wt_free(session, dhandle->checkpoint);
+ __conn_btree_config_clear(session);
+ __wt_free(session, dhandle->handle);
+ __wt_spin_destroy(session, &dhandle->close_lock);
+ __wt_overwrite_and_free(session, dhandle);
+
+ WT_CLEAR_BTREE_IN_SESSION(session);
+ }
+
+err: session->dhandle = save_dhandle;
+ WT_ASSERT(session, !final || ret == 0);
+ return (ret);
+}
+
+/*
+ * __wt_conn_dhandle_discard --
+ * Close/discard all data handles.
+ */
+int
+__wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ /*
+ * Close open data handles: first, everything but the metadata file
+ * (as closing a normal file may open and write the metadata file),
+ * then the metadata file. This function isn't called often, and I
+ * don't want to "know" anything about the metadata file's position on
+ * the list, so we do it the hard way.
+ */
+restart:
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (WT_IS_METADATA(dhandle))
+ continue;
+
+ WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));
+ goto restart;
+ }
+
+ /*
+ * Closing the files may have resulted in entries on our default
+ * session's list of open data handles, specifically, we added the
+ * metadata file if any of the files were dirty. Clean up that list
+ * before we shut down the metadata entry, for good.
+ */
+ __wt_session_close_cache(session);
+ F_SET(session, WT_SESSION_NO_DATA_HANDLES);
+
+ /* Close the metadata file handle. */
+ while ((dhandle = SLIST_FIRST(&conn->dhlh)) != NULL)
+ WT_TRET(__wt_conn_dhandle_discard_single(session, dhandle, 1));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c
new file mode 100644
index 00000000000..e4f0a6ddd73
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_handle.c
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_connection_init --
+ * Structure initialization for a just-created WT_CONNECTION_IMPL handle.
+ */
+int
+__wt_connection_init(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = conn->default_session;
+
+ SLIST_INIT(&conn->dhlh); /* Data handle list */
+ TAILQ_INIT(&conn->dlhqh); /* Library list */
+ TAILQ_INIT(&conn->dsrcqh); /* Data source list */
+ TAILQ_INIT(&conn->fhqh); /* File list */
+ TAILQ_INIT(&conn->collqh); /* Collator list */
+ TAILQ_INIT(&conn->compqh); /* Compressor list */
+
+ TAILQ_INIT(&conn->lsmqh); /* WT_LSM_TREE list */
+
+ /* Setup the LSM work queues. */
+ TAILQ_INIT(&conn->lsm_manager.switchqh);
+ TAILQ_INIT(&conn->lsm_manager.appqh);
+ TAILQ_INIT(&conn->lsm_manager.managerqh);
+
+ /* Configuration. */
+ WT_RET(__wt_conn_config_init(session));
+
+ /* Statistics. */
+ __wt_stat_init_connection_stats(&conn->stats);
+
+ /* Locks. */
+ WT_RET(__wt_spin_init(session, &conn->api_lock, "api"));
+ WT_RET(__wt_spin_init(session, &conn->checkpoint_lock, "checkpoint"));
+ WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle"));
+ WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
+ WT_RET(__wt_spin_init(session, &conn->hot_backup_lock, "hot backup"));
+ WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
+ WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema"));
+ WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock));
+ for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ WT_RET(
+ __wt_spin_init(session, &conn->page_lock[i], "btree page"));
+
+ /* Setup the spin locks for the LSM manager queues. */
+ WT_RET(__wt_spin_init(session,
+ &conn->lsm_manager.app_lock, "LSM application queue lock"));
+ WT_RET(__wt_spin_init(session,
+ &conn->lsm_manager.manager_lock, "LSM manager queue lock"));
+ WT_RET(__wt_spin_init(
+ session, &conn->lsm_manager.switch_lock, "LSM switch queue lock"));
+ WT_RET(__wt_cond_alloc(
+ session, "LSM worker cond", 0, &conn->lsm_manager.work_cond));
+
+ /*
+ * Generation numbers.
+ *
+ * Start split generations at one. Threads publish this generation
+ * number before examining tree structures, and zero when they leave.
+ * We need to distinguish between threads that are in a tree before the
+ * first split has happened, and threads that are not in a tree.
+ */
+ conn->split_gen = 1;
+
+ /*
+ * Block manager.
+ * XXX
+ * If there's ever a second block manager, we'll want to make this
+ * more opaque, but for now this is simpler.
+ */
+ WT_RET(__wt_spin_init(session, &conn->block_lock, "block manager"));
+ TAILQ_INIT(&conn->blockqh); /* Block manager list */
+
+ return (0);
+}
+
+/*
+ * __wt_connection_destroy --
+ * Destroy the connection's underlying WT_CONNECTION_IMPL structure.
+ */
+int
+__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ /* Check there's something to destroy. */
+ if (conn == NULL)
+ return (0);
+
+ session = conn->default_session;
+
+ /*
+	 * Close remaining open files (before discarding the mutex: the
+	 * underlying file-close code uses the mutex to guard lists of
+	 * open files).
+ */
+ if (conn->lock_fh != NULL)
+ WT_TRET(__wt_close(session, conn->lock_fh));
+
+ /* Remove from the list of connections. */
+ __wt_spin_lock(session, &__wt_process.spinlock);
+ TAILQ_REMOVE(&__wt_process.connqh, conn, q);
+ __wt_spin_unlock(session, &__wt_process.spinlock);
+
+	__wt_conn_config_discard(session);	/* configuration */
+
+ __wt_conn_foc_discard(session); /* free-on-close */
+
+ __wt_spin_destroy(session, &conn->api_lock);
+ __wt_spin_destroy(session, &conn->block_lock);
+ __wt_spin_destroy(session, &conn->checkpoint_lock);
+ __wt_spin_destroy(session, &conn->dhandle_lock);
+ __wt_spin_destroy(session, &conn->fh_lock);
+ __wt_spin_destroy(session, &conn->hot_backup_lock);
+ __wt_spin_destroy(session, &conn->reconfig_lock);
+ __wt_spin_destroy(session, &conn->schema_lock);
+ for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ __wt_spin_destroy(session, &conn->page_lock[i]);
+ __wt_free(session, conn->page_lock);
+
+ /* Free allocated memory. */
+ __wt_free(session, conn->cfg);
+ __wt_free(session, conn->home);
+ __wt_free(session, conn->error_prefix);
+ __wt_free(session, conn->sessions);
+
+ __wt_free(NULL, conn);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
new file mode 100644
index 00000000000..e516fdc68d2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -0,0 +1,284 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __logmgr_sync_cfg --
+ * Interpret the transaction_sync config.
+ */
+static int
+__logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ WT_RET(
+ __wt_config_gets(session, cfg, "transaction_sync.enabled", &cval));
+ if (cval.val)
+ FLD_SET(conn->txn_logsync, WT_LOG_FLUSH);
+ else
+ FLD_CLR(conn->txn_logsync, WT_LOG_FLUSH);
+
+ WT_RET(
+ __wt_config_gets(session, cfg, "transaction_sync.method", &cval));
+ FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC);
+ if (WT_STRING_MATCH("dsync", cval.str, cval.len))
+ FLD_SET(conn->txn_logsync, WT_LOG_DSYNC);
+ else if (WT_STRING_MATCH("fsync", cval.str, cval.len))
+ FLD_SET(conn->txn_logsync, WT_LOG_FSYNC);
+ return (0);
+}
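+
+/*
+ * Illustration (editor's note, not part of this change): given a connection
+ * configuration string such as
+ *
+ *	transaction_sync=(enabled=true,method=fsync)
+ *
+ * the code above sets WT_LOG_FLUSH and WT_LOG_FSYNC in conn->txn_logsync;
+ * "method=dsync" would set WT_LOG_DSYNC instead, and any other method value
+ * leaves both sync-method flags clear.
+ */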
+
+/*
+ * __logmgr_config --
+ * Parse and setup the logging server options.
+ */
+static int
+__logmgr_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * The logging configuration is off by default.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
+ *runp = cval.val != 0;
+ if (*runp == 0)
+ return (0);
+
+ WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval));
+ conn->archive = cval.val != 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
+ conn->log_file_max = (wt_off_t)cval.val;
+ WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max);
+
+ WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
+ WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path));
+
+ WT_RET(__logmgr_sync_cfg(session, cfg));
+ return (0);
+}
+
+/*
+ * __log_archive_server --
+ * The log archiving server thread.
+ */
+static void *
+__log_archive_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LSN lsn;
+ WT_SESSION_IMPL *session;
+ uint32_t lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ session = arg;
+ conn = S2C(session);
+ log = conn->log;
+ logcount = 0;
+ logfiles = NULL;
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ /*
+ * If archiving is reconfigured and turned off, wait until it
+ * gets turned back on and check again. Don't wait forever: if
+ * a notification gets lost during close, we want to find out
+ * eventually.
+ */
+ if (conn->archive == 0 ||
+ __wt_try_writelock(session, log->log_archive_lock) != 0) {
+ if (conn->archive != 0) {
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_archive: Blocked due to open log "
+ "cursor holding archive lock"));
+ }
+ WT_ERR(
+ __wt_cond_wait(session, conn->arch_cond, 1000000));
+ continue;
+ }
+
+ lsn = log->ckpt_lsn;
+ lsn.offset = 0;
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_archive: ckpt LSN %" PRIu32 ",%" PRIu64,
+ lsn.file, lsn.offset));
+ /*
+ * Main archive code. Get the list of all log files and
+ * remove any earlier than the checkpoint LSN.
+ */
+ WT_ERR(__wt_dirlist(session, conn->log_path,
+ WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));
+
+ /*
+ * We can only archive files if a hot backup is not in progress.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ for (i = 0; i < logcount; i++) {
+ if (conn->hot_backup == 0) {
+ WT_ERR(__wt_log_extract_lognum(
+ session, logfiles[i], &lognum));
+ if (lognum < lsn.file)
+ WT_ERR(
+ __wt_log_remove(session, lognum));
+ }
+ }
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+ __wt_log_files_free(session, logfiles, logcount);
+ logfiles = NULL;
+ logcount = 0;
+
+ /*
+		 * Indicate our new earliest LSN: the start of the log file
+		 * containing the last checkpoint.
+ */
+ log->first_lsn = lsn;
+ log->first_lsn.offset = 0;
+ WT_ERR(__wt_writeunlock(session, log->log_archive_lock));
+
+ /* Wait until the next event. */
+ WT_ERR(__wt_cond_wait(session, conn->arch_cond, 1000000));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "log archive server error");
+ }
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ return (NULL);
+}
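+
+/*
+ * Illustration (editor's note, not part of this change): suppose the log
+ * directory holds log files numbered 1 through 4 and the last checkpoint's
+ * LSN is in file 3. The loop above extracts each file's number and removes
+ * files 1 and 2 (lognum < lsn.file); files 3 and 4 survive because they may
+ * be needed to recover from that checkpoint.
+ */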
+
+/*
+ * __wt_logmgr_create --
+ * Start the log subsystem and archive server thread.
+ */
+int
+__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int run;
+
+ conn = S2C(session);
+
+ /* Handle configuration. */
+ WT_RET(__logmgr_config(session, cfg, &run));
+
+ /* If logging is not configured, we're done. */
+ if (!run)
+ return (0);
+
+ conn->logging = 1;
+ /*
+	 * Logging is on: allocate the WT_LOG structure and open the log file.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_LOG), &conn->log));
+ log = conn->log;
+ WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
+ WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
+ WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
+ WT_RET(__wt_rwlock_alloc(session,
+ &log->log_archive_lock, "log archive lock"));
+ if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
+ log->allocsize =
+ WT_MAX((uint32_t)conn->buffer_alignment, LOG_ALIGN);
+ else
+ log->allocsize = LOG_ALIGN;
+ INIT_LSN(&log->alloc_lsn);
+ INIT_LSN(&log->ckpt_lsn);
+ INIT_LSN(&log->first_lsn);
+ INIT_LSN(&log->sync_lsn);
+ INIT_LSN(&log->trunc_lsn);
+ INIT_LSN(&log->write_lsn);
+ log->fileid = 0;
+ WT_RET(__wt_cond_alloc(session, "log sync", 0, &log->log_sync_cond));
+ WT_RET(__wt_log_open(session));
+ WT_RET(__wt_log_slot_init(session));
+
+ /* If archiving is not configured, we're done. */
+ if (!conn->archive)
+ return (0);
+
+ /*
+	 * If an archive thread already exists, the user may have reconfigured
+	 * archiving; signal the thread. Otherwise, the user wants archiving
+	 * and we need to start up the thread.
+ */
+ if (conn->arch_session != NULL) {
+ WT_ASSERT(session, conn->arch_cond != NULL);
+ WT_ASSERT(session, conn->arch_tid_set != 0);
+ WT_RET(__wt_cond_signal(session, conn->arch_cond));
+ } else {
+ /* The log archive server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "archive-server", 0, 0, &conn->arch_session));
+ WT_RET(__wt_cond_alloc(conn->arch_session,
+ "log archiving server", 0, &conn->arch_cond));
+
+ /*
+ * Start the thread.
+ */
+ WT_RET(__wt_thread_create(conn->arch_session,
+ &conn->arch_tid, __log_archive_server, conn->arch_session));
+ conn->arch_tid_set = 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_logmgr_destroy --
+ * Destroy the log archiving server thread and logging subsystem.
+ */
+int
+__wt_logmgr_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ if (!conn->logging)
+ return (0);
+ if (conn->arch_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->arch_cond));
+ WT_TRET(__wt_thread_join(session, conn->arch_tid));
+ conn->arch_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->arch_cond));
+
+ WT_TRET(__wt_log_close(session));
+
+ __wt_free(session, conn->log_path);
+
+ /* Close the server thread's session. */
+ if (conn->arch_session != NULL) {
+ wt_session = &conn->arch_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ conn->arch_session = NULL;
+ }
+
+ WT_TRET(__wt_log_slot_destroy(session));
+ WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
+ WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
+ __wt_spin_destroy(session, &conn->log->log_lock);
+ __wt_spin_destroy(session, &conn->log->log_slot_lock);
+ __wt_spin_destroy(session, &conn->log->log_sync_lock);
+ __wt_free(session, conn->log);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
new file mode 100644
index 00000000000..41fc9809521
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -0,0 +1,244 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_connection_open --
+ * Open a connection.
+ */
+int
+__wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
+{
+ WT_SESSION_IMPL *session;
+
+ /* Default session. */
+ session = conn->default_session;
+ WT_ASSERT(session, session->iface.connection == &conn->iface);
+
+ /*
+ * Tell internal server threads to run: this must be set before opening
+ * any sessions.
+ */
+ F_SET(conn, WT_CONN_SERVER_RUN);
+
+ /* WT_SESSION_IMPL array. */
+ WT_RET(__wt_calloc(session,
+ conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
+
+ /*
+ * Open the default session. We open this before starting service
+ * threads because those may allocate and use session resources that
+ * need to get cleaned up on close.
+ */
+ WT_RET(__wt_open_internal_session(conn, "connection", 1, 0, &session));
+
+ /*
+ * The connection's default session is originally a static structure,
+ * swap that out for a more fully-functional session. It's necessary
+ * to have this step: the session allocation code uses the connection's
+ * session, and if we pass a reference to the default session as the
+ * place to store the allocated session, things get confused and error
+ * handling can be corrupted. So, we allocate into a stack variable
+ * and then assign it on success.
+ */
+ conn->default_session = session;
+
+ /*
+ * Publish: there must be a barrier to ensure the connection structure
+ * fields are set before other threads read from the pointer.
+ */
+ WT_WRITE_BARRIER();
+
+ /* Connect to a cache pool. */
+ WT_RET(__wt_cache_pool_config(session, cfg));
+
+ /* Create the cache. */
+ WT_RET(__wt_cache_create(session, cfg));
+
+ /* Initialize transaction support. */
+ WT_RET(__wt_txn_global_init(session, cfg));
+
+ return (0);
+}
+
+/*
+ * __wt_connection_close --
+ * Close a connection handle.
+ */
+int
+__wt_connection_close(WT_CONNECTION_IMPL *conn)
+{
+ WT_CONNECTION *wt_conn;
+ WT_DECL_RET;
+ WT_DLH *dlh;
+ WT_FH *fh;
+ WT_SESSION_IMPL *s, *session;
+ WT_TXN_GLOBAL *txn_global;
+ u_int i;
+
+ wt_conn = &conn->iface;
+ txn_global = &conn->txn_global;
+ session = conn->default_session;
+
+ /*
+ * We're shutting down. Make sure everything gets freed.
+ *
+ * It's possible that the eviction server is in the middle of a long
+ * operation, with a transaction ID pinned. In that case, we will loop
+ * here until the transaction ID is released, when the oldest
+ * transaction ID will catch up with the current ID.
+ */
+ for (;;) {
+ __wt_txn_update_oldest(session);
+ if (txn_global->oldest_id == txn_global->current)
+ break;
+ __wt_yield();
+ }
+
+ /* Clear any pending async ops. */
+ WT_TRET(__wt_async_flush(session));
+
+ /*
+ * Shut down server threads other than the eviction server, which is
+ * needed later to close btree handles. Some of these threads access
+ * btree handles, so take care in ordering shutdown to make sure they
+ * exit before files are closed.
+ */
+ F_CLR(conn, WT_CONN_SERVER_RUN);
+ WT_TRET(__wt_async_destroy(session));
+ WT_TRET(__wt_lsm_manager_destroy(session));
+ WT_TRET(__wt_checkpoint_server_destroy(session));
+ WT_TRET(__wt_statlog_destroy(session, 1));
+ WT_TRET(__wt_sweep_destroy(session));
+
+ /* Close open data handles. */
+ WT_TRET(__wt_conn_dhandle_discard(session));
+
+ /*
+ * Now that all data handles are closed, tell logging that a checkpoint
+ * has completed then shut down the log manager (only after closing
+ * data handles).
+ */
+ if (conn->logging) {
+ WT_TRET(__wt_txn_checkpoint_log(
+ session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
+ WT_TRET(__wt_logmgr_destroy(session));
+ }
+
+ /* Free memory for collators, compressors, data sources. */
+ WT_TRET(__wt_conn_remove_collator(session));
+ WT_TRET(__wt_conn_remove_compressor(session));
+ WT_TRET(__wt_conn_remove_data_source(session));
+
+ /*
+	 * Complain if files weren't closed, ignoring the lock file; we'll
+ * close it in a minute.
+ */
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
+ if (fh == conn->lock_fh)
+ continue;
+
+ __wt_errx(session,
+ "Connection has open file handles: %s", fh->name);
+ WT_TRET(__wt_close(session, fh));
+ fh = TAILQ_FIRST(&conn->fhqh);
+ }
+
+ /* Shut down the eviction server thread. */
+ WT_TRET(__wt_evict_destroy(session));
+
+ /* Disconnect from shared cache - must be before cache destroy. */
+ WT_TRET(__wt_conn_cache_pool_destroy(session));
+
+ /* Discard the cache. */
+ WT_TRET(__wt_cache_destroy(session));
+
+ /* Discard transaction state. */
+ __wt_txn_global_destroy(session);
+
+ /* Close extensions, first calling any unload entry point. */
+ while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
+ TAILQ_REMOVE(&conn->dlhqh, dlh, q);
+
+ if (dlh->terminate != NULL)
+ WT_TRET(dlh->terminate(wt_conn));
+ WT_TRET(__wt_dlclose(session, dlh));
+ }
+
+ /*
+ * Close the internal (default) session, and switch back to the dummy
+ * session in case of any error messages from the remaining operations
+ * while destroying the connection handle.
+ */
+ if (session != &conn->dummy_session) {
+ WT_TRET(session->iface.close(&session->iface, NULL));
+ session = conn->default_session = &conn->dummy_session;
+ }
+
+ /*
+ * The session's split stash isn't discarded during normal session close
+ * because it may persist past the life of the session. Discard it now.
+ */
+ if ((s = conn->sessions) != NULL)
+ for (i = 0; i < conn->session_size; ++s, ++i)
+ __wt_split_stash_discard_all(session, s);
+
+ /*
+ * The session's hazard pointer memory isn't discarded during normal
+ * session close because access to it isn't serialized. Discard it
+ * now.
+ */
+ if ((s = conn->sessions) != NULL)
+ for (i = 0; i < conn->session_size; ++s, ++i)
+ if (s != session)
+ __wt_free(session, s->hazard);
+
+ /* Destroy the handle. */
+ WT_TRET(__wt_connection_destroy(conn));
+
+ return (ret);
+}
+
+/*
+ * __wt_connection_workers --
+ * Start the worker threads.
+ */
+int
+__wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ /*
+ * Start the eviction thread.
+ */
+ WT_RET(__wt_evict_create(session));
+
+ /*
+ * Start the handle sweep thread.
+ */
+ WT_RET(__wt_sweep_create(session));
+
+ /*
+ * Start the optional statistics thread. Start statistics first so that
+ * other optional threads can know if statistics are enabled or not.
+ */
+ WT_RET(__wt_statlog_create(session, cfg));
+
+ /* Start the optional async threads. */
+ WT_RET(__wt_async_create(session, cfg));
+
+ /*
+ * Start the optional logging/archive thread.
+ * NOTE: The log manager must be started before checkpoints so that the
+ * checkpoint server knows if logging is enabled.
+ */
+ WT_RET(__wt_logmgr_create(session, cfg));
+
+ /* Start the optional checkpoint thread. */
+ WT_RET(__wt_checkpoint_server_create(session, cfg));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_stat.c b/src/third_party/wiredtiger/src/conn/conn_stat.c
new file mode 100644
index 00000000000..f7229504898
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_stat.c
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef __GNUC__
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1)
+/*
+ * !!!
+ * GCC with -Wformat-nonliteral complains about calls to strftime in this file.
+ * There's nothing wrong, this makes the warning go away.
+ */
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+#endif
+#endif
+
+/*
+ * __stat_sources_free --
+ * Free the array of statistics sources.
+ */
+static void
+__stat_sources_free(WT_SESSION_IMPL *session, char ***sources)
+{
+ char **p;
+
+ if ((p = (*sources)) != NULL) {
+ for (; *p != NULL; ++p)
+ __wt_free(session, *p);
+ __wt_free(session, *sources);
+ }
+}
+
+/*
+ * __wt_conn_stat_init --
+ * Initialize the per-connection statistics.
+ */
+void
+__wt_conn_stat_init(WT_SESSION_IMPL *session)
+{
+ __wt_async_stats_update(session);
+ __wt_cache_stats_update(session);
+ __wt_txn_stats_update(session);
+}
+
+/*
+ * __statlog_config --
+ * Parse and setup the statistics server options.
+ */
+static int
+__statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp)
+{
+ WT_CONFIG objectconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ int cnt;
+ char **sources;
+
+ conn = S2C(session);
+ sources = NULL;
+
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval));
+	/* Only start the server if the wait time is non-zero. */
+ *runp = (cval.val == 0) ? 0 : 1;
+ conn->stat_usecs = (long)cval.val * 1000000;
+
+ WT_RET(__wt_config_gets(
+ session, cfg, "statistics_log.on_close", &cval));
+ if (cval.val != 0)
+ FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE);
+
+ /*
+ * Statistics logging configuration requires either a wait time or an
+ * on-close setting.
+ */
+ if (*runp == 0 && !FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE))
+ return (0);
+
+ WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval));
+ WT_RET(__wt_config_subinit(session, &objectconf, &cval));
+ for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt)
+ ;
+ WT_RET_NOTFOUND_OK(ret);
+ if (cnt != 0) {
+ WT_RET(__wt_calloc_def(session, cnt + 1, &sources));
+ WT_RET(__wt_config_subinit(session, &objectconf, &cval));
+ for (cnt = 0;
+ (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) {
+ /*
+ * XXX
+ * Only allow "file:" and "lsm:" for now: "file:" works
+ * because it's been converted to data handles, "lsm:"
+ * works because we can easily walk the list of open LSM
+ * objects, even though it hasn't been converted.
+ */
+ if (!WT_PREFIX_MATCH(k.str, "file:") &&
+ !WT_PREFIX_MATCH(k.str, "lsm:"))
+ WT_ERR_MSG(session, EINVAL,
+ "statistics_log sources configuration only "
+ "supports objects of type \"file\" or "
+ "\"lsm\"");
+ WT_ERR(
+ __wt_strndup(session, k.str, k.len, &sources[cnt]));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ conn->stat_sources = sources;
+ sources = NULL;
+ }
+
+ WT_ERR(__wt_config_gets(session, cfg, "statistics_log.path", &cval));
+ WT_ERR(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path));
+
+ WT_ERR(__wt_config_gets(
+ session, cfg, "statistics_log.timestamp", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &conn->stat_format));
+
+err: __stat_sources_free(session, &sources);
+ return (ret);
+}
+
+/*
+ * __statlog_dump --
+ * Dump out handle/connection statistics.
+ */
+static int
+__statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_STATS *stats;
+ u_int i;
+ uint64_t max;
+ const char *uri;
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor), NULL };
+
+ conn = S2C(session);
+
+ /* Build URI and configuration string. */
+ if (conn_stats)
+ uri = "statistics:";
+ else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "statistics:%s", name));
+ uri = tmp->data;
+ }
+
+ /*
+ * Open the statistics cursor and dump the statistics.
+ *
+	 * If we don't find an underlying object, silently ignore it; the
+	 * object may exist only intermittently.
+ */
+ switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) {
+ case 0:
+ max = conn_stats ?
+ sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS) :
+ sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+ for (i = 0,
+ stats = WT_CURSOR_STATS(cursor); i < max; ++i, ++stats)
+ WT_ERR_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s %s\n",
+ conn->stat_stamp,
+ stats->v, name, stats->desc) < 0), __wt_errno());
+ WT_ERR(cursor->close(cursor));
+ break;
+ case EBUSY:
+ case ENOENT:
+ case WT_NOTFOUND:
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
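+
+/*
+ * Illustration (editor's note, not part of this change): each statistic is
+ * printed as one line, "<timestamp> <value> <object name> <description>",
+ * for example (values invented):
+ *
+ *	Mar 01 12:00:00 4096 file:example.wt btree: overflow pages
+ *
+ * so the output is trivially filtered by object name or description.
+ */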
+
+/*
+ * __statlog_apply --
+ * Review a single open handle and dump statistics on demand.
+ */
+static int
+__statlog_apply(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DATA_HANDLE *dhandle;
+ char **p;
+
+ WT_UNUSED(cfg);
+
+ dhandle = session->dhandle;
+
+ /* Check for a match on the set of sources. */
+ for (p = S2C(session)->stat_sources; *p != NULL; ++p)
+ if (WT_PREFIX_MATCH(dhandle->name, *p))
+ return (__statlog_dump(session, dhandle->name, 0));
+ return (0);
+}
+
+/*
+ * __statlog_lsm_apply --
+ *	Review the list of open LSM trees, and dump statistics on demand.
+ *
+ * XXX
+ * This code should be removed when LSM objects are converted to data handles.
+ */
+static int
+__statlog_lsm_apply(WT_SESSION_IMPL *session)
+{
+#define WT_LSM_TREE_LIST_SLOTS 100
+ WT_LSM_TREE *lsm_tree, *list[WT_LSM_TREE_LIST_SLOTS];
+ WT_DECL_RET;
+ int cnt, locked;
+ char **p;
+
+ cnt = locked = 0;
+
+ /*
+ * Walk the list of LSM trees, checking for a match on the set of
+ * sources.
+ *
+ * XXX
+ * We can't hold the schema lock for the traversal because the LSM
+ * statistics code acquires the tree lock, and the LSM cursor code
+	 * acquires the tree lock and then acquires the schema lock; it's a
+ * classic deadlock. This is temporary code so I'm not going to do
+ * anything fancy.
+ * It is OK to not keep holding the schema lock after populating
+ * the list of matching LSM trees, since the __wt_lsm_tree_get call
+ * will bump a reference count, so the tree won't go away.
+ */
+ __wt_spin_lock(session, &S2C(session)->schema_lock);
+ locked = 1;
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
+ if (cnt == WT_LSM_TREE_LIST_SLOTS)
+ break;
+ for (p = S2C(session)->stat_sources; *p != NULL; ++p)
+ if (WT_PREFIX_MATCH(lsm_tree->name, *p)) {
+ WT_ERR(__wt_lsm_tree_get(
+ session, lsm_tree->name, 0, &list[cnt++]));
+ break;
+ }
+ }
+ __wt_spin_unlock(session, &S2C(session)->schema_lock);
+ locked = 0;
+
+ while (cnt > 0) {
+ --cnt;
+ WT_TRET(__statlog_dump(session, list[cnt]->name, 0));
+ __wt_lsm_tree_release(session, list[cnt]);
+ }
+
+err: if (locked)
+ __wt_spin_unlock(session, &S2C(session)->schema_lock);
+ /* Release any LSM trees on error. */
+ while (cnt > 0) {
+ --cnt;
+ __wt_lsm_tree_release(session, list[cnt]);
+ }
+ return (ret);
+}
+
+/*
+ * __statlog_log_one --
+ * Output a set of statistics into the current log file.
+ */
+static int
+__statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
+{
+ FILE *log_file;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ struct timespec ts;
+ struct tm *tm, _tm;
+
+ conn = S2C(session);
+
+ /* Get the current local time of day. */
+ WT_RET(__wt_epoch(session, &ts));
+ tm = localtime_r(&ts.tv_sec, &_tm);
+
+ /* Create the logging path name for this time of day. */
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_path, tm) == 0)
+ WT_RET_MSG(session, ENOMEM, "strftime path conversion");
+
+ /* If the path has changed, cycle the log file. */
+ if ((log_file = conn->stat_fp) == NULL ||
+ path == NULL || strcmp(tmp->mem, path->mem) != 0) {
+ conn->stat_fp = NULL;
+ if (log_file != NULL)
+ WT_RET(fclose(log_file) == 0 ? 0 : __wt_errno());
+
+ if (path != NULL)
+ (void)strcpy(path->mem, tmp->mem);
+ WT_RET_TEST((log_file =
+ fopen(tmp->mem, "a")) == NULL, __wt_errno());
+ }
+ conn->stat_fp = log_file;
+
+ /* Create the entry prefix for this time of day. */
+ if (strftime(tmp->mem, tmp->memsize, conn->stat_format, tm) == 0)
+ WT_RET_MSG(session, ENOMEM, "strftime timestamp conversion");
+ conn->stat_stamp = tmp->mem;
+
+ /* Dump the connection statistics. */
+ WT_RET(__statlog_dump(session, conn->home, 1));
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ /* Dump the spinlock statistics. */
+ WT_RET(__wt_statlog_dump_spinlock(conn, conn->home));
+#endif
+
+ /*
+ * Lock the schema and walk the list of open handles, dumping
+ * any that match the list of object sources.
+ */
+ if (conn->stat_sources != NULL) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_conn_btree_apply(session, 0, __statlog_apply, NULL));
+ WT_RET(ret);
+ }
+
+ /*
+ * Walk the list of open LSM trees, dumping any that match the
+	 * list of object sources.
+ *
+ * XXX
+ * This code should be removed when LSM objects are converted to
+ * data handles.
+ */
+ if (conn->stat_sources != NULL)
+ WT_RET(__statlog_lsm_apply(session));
+
+ /* Flush. */
+ WT_RET(fflush(conn->stat_fp) == 0 ? 0 : __wt_errno());
+
+ return (0);
+}
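+
+/*
+ * Illustration (editor's note, not part of this change): with a path template
+ * of, say, "WiredTigerStat.%d.%H", strftime expands to one file name per
+ * day-of-month and hour (for example "WiredTigerStat.01.12"). When the clock
+ * rolls into the next hour the expansion changes, the strcmp above notices,
+ * and the old file is closed and a new one opened, giving periodic log
+ * rotation without any explicit scheduling.
+ */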
+
+/*
+ * __wt_statlog_log_one --
+ * Log a set of statistics into the configured statistics log. Requires
+ * that the server is not currently running.
+ */
+int
+__wt_statlog_log_one(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_DECL_ITEM(tmp);
+
+ conn = S2C(session);
+
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE))
+ return (0);
+
+ if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_STATISTICS))
+ WT_RET_MSG(session, EINVAL,
+ "Attempt to log statistics while a server is running");
+
+ WT_RET(__wt_scr_alloc(session, strlen(conn->stat_path) + 128, &tmp));
+ WT_ERR(__statlog_log_one(session, NULL, tmp));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __statlog_server --
+ * The statistics server thread.
+ */
+static void *
+__statlog_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_ITEM path, tmp;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+
+ WT_CLEAR(path);
+ WT_CLEAR(tmp);
+
+ /*
+ * We need a temporary place to build a path and an entry prefix.
+ * The length of the path plus 128 should be more than enough.
+ *
+ * We also need a place to store the current path, because that's
+ * how we know when to close/re-open the file.
+ */
+ WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128));
+ WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128));
+
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) {
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE))
+ WT_ERR(__statlog_log_one(session, &path, &tmp));
+
+ /* Wait until the next event. */
+ WT_ERR(
+ __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "statistics log server error");
+ }
+ __wt_buf_free(session, &path);
+ __wt_buf_free(session, &tmp);
+ return (NULL);
+}
+
+/*
+ * __statlog_start --
+ * Start the statistics server thread.
+ */
+static int
+__statlog_start(WT_CONNECTION_IMPL *conn)
+{
+ WT_SESSION_IMPL *session;
+
+ /* Nothing to do if the server is already running. */
+ if (conn->stat_session != NULL)
+ return (0);
+
+ F_SET(conn, WT_CONN_SERVER_STATISTICS);
+ /* The statistics log server gets its own session. */
+ WT_RET(__wt_open_internal_session(
+ conn, "statlog-server", 1, 1, &conn->stat_session));
+ session = conn->stat_session;
+
+ WT_RET(__wt_cond_alloc(
+ session, "statistics log server", 0, &conn->stat_cond));
+
+ /*
+ * Start the thread.
+ *
+ * Statistics logging creates a thread per database, rather than using
+ * a single thread to do logging for all of the databases. If we ever
+	 * see lots of databases doing statistics logging at the same time and
+	 * want to reduce the number of threads, there's no reason we have to
+	 * have more than one thread; I just didn't feel like writing the code
+ * to figure out the scheduling.
+ */
+ WT_RET(__wt_thread_create(
+ session, &conn->stat_tid, __statlog_server, session));
+ conn->stat_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_statlog_create --
+ *	Configure and start the statistics server thread.
+ */
+int
+__wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ int start;
+
+ conn = S2C(session);
+ start = 0;
+
+ /*
+ * Stop any server that is already running. This means that each time
+ * reconfigure is called we'll bounce the server even if there are no
+	 * configuration changes, but that makes our lives easier.
+ */
+ if (conn->stat_session != NULL)
+ WT_RET(__wt_statlog_destroy(session, 0));
+
+ WT_RET(__statlog_config(session, cfg, &start));
+ if (start)
+ WT_RET(__statlog_start(conn));
+
+ return (0);
+}
+
+/*
+ * __wt_statlog_destroy --
+ * Destroy the statistics server thread.
+ */
+int
+__wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_STATISTICS);
+ if (conn->stat_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->stat_cond));
+ WT_TRET(__wt_thread_join(session, conn->stat_tid));
+ conn->stat_tid_set = 0;
+ }
+
+ /* Log a set of statistics on shutdown if configured. */
+ if (is_close)
+ WT_TRET(__wt_statlog_log_one(session));
+
+ WT_TRET(__wt_cond_destroy(session, &conn->stat_cond));
+
+ __stat_sources_free(session, &conn->stat_sources);
+ __wt_free(session, conn->stat_path);
+ __wt_free(session, conn->stat_format);
+
+ /* Close the server thread's session. */
+ if (conn->stat_session != NULL) {
+ wt_session = &conn->stat_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+
+ /* Clear connection settings so reconfigure is reliable. */
+ conn->stat_session = NULL;
+ conn->stat_tid_set = 0;
+ conn->stat_format = NULL;
+ if (conn->stat_fp != NULL) {
+ WT_TRET(fclose(conn->stat_fp) == 0 ? 0 : __wt_errno());
+ conn->stat_fp = NULL;
+ }
+ conn->stat_path = NULL;
+ conn->stat_sources = NULL;
+ conn->stat_stamp = NULL;
+ conn->stat_usecs = 0;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/conn/conn_sweep.c b/src/third_party/wiredtiger/src/conn/conn_sweep.c
new file mode 100644
index 00000000000..3bccc5814be
--- /dev/null
+++ b/src/third_party/wiredtiger/src/conn/conn_sweep.c
@@ -0,0 +1,187 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __sweep --
+ * Close unused dhandles on the connection dhandle list.
+ */
+static int
+__sweep(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle, *dhandle_next;
+ WT_DECL_RET;
+ time_t now;
+
+ conn = S2C(session);
+
+ /*
+	 * Sessions cache handles unless the session itself is closed, at which
+ * time the handle reference counts are immediately decremented. Don't
+ * discard handles that have been open recently.
+ */
+ WT_RET(__wt_seconds(session, &now));
+
+ dhandle = SLIST_FIRST(&conn->dhlh);
+ for (; dhandle != NULL; dhandle = dhandle_next) {
+ dhandle_next = SLIST_NEXT(dhandle, l);
+ if (dhandle->session_ref != 0 ||
+ now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
+ continue;
+
+ /*
+ * We have a candidate for closing; if it's open, flush dirty
+ * leaf pages, then acquire an exclusive lock on the handle
+ * and close it. We might be blocking opens for a long time
+		 * (over disk I/O), but the handle was quiescent for a while.
+ *
+ * The close can fail if an update cannot be written (updates in
+ * a no-longer-referenced file might not yet be globally visible
+ * if sessions have disjoint sets of files open). If the handle
+		 * is busy, skip it; we'll retry the close next time, after
+ * the transaction state has progressed.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_cache_op(
+ session, NULL, WT_SYNC_WRITE_LEAVES));
+ WT_RET(ret);
+
+ /*
+			 * We deliberately don't set WT_DHANDLE_EXCLUSIVE: we
+ * want opens to block on us rather than returning an
+ * EBUSY error to the application.
+ */
+ ret = __wt_try_writelock(session, dhandle->rwlock);
+ if (ret == EBUSY) {
+ ret = 0;
+ continue;
+ }
+ WT_RET(ret);
+
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_conn_btree_sync_and_close(session, 0));
+ if (ret == EBUSY)
+ ret = 0;
+
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ WT_RET(ret);
+ }
+
+ /*
+		 * Attempt to discard the handle. The called function checks
+		 * the handle-open flag after acquiring the appropriate locks,
+		 * which is why we don't do any special handling of EBUSY
+		 * returns above: that path never cleared the handle-open flag.
+ */
+ ret = __wt_conn_dhandle_discard_single(session, dhandle, 0);
+ if (ret == EBUSY)
+ ret = 0;
+ WT_RET(ret);
+ }
+ return (0);
+}
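
The eligibility test above reduces to two conditions: no session holds a reference, and the handle has been idle longer than the sweep wait. A standalone sketch of that test; SWEEP_WAIT_SECS and the parameter names are illustrative, not the WiredTiger symbols:

	#include <time.h>

	#define	SWEEP_WAIT_SECS	30	/* illustrative idle threshold */

	/*
	 * Return non-zero if an unreferenced, long-idle handle is a
	 * candidate for closing.
	 */
	static int
	handle_expired(time_t now, time_t timeofdeath, unsigned session_ref)
	{
		return (session_ref == 0 &&
		    now - timeofdeath > SWEEP_WAIT_SECS);
	}
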
+
+/*
+ * __sweep_server --
+ * The handle sweep server thread.
+ */
+static void *
+__sweep_server(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = arg;
+ conn = S2C(session);
+
+ /*
+ * Sweep for dead handles.
+ */
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(conn, WT_CONN_SERVER_SWEEP)) {
+
+ /* Wait until the next event. */
+ WT_ERR(
+ __wt_cond_wait(session, conn->sweep_cond, 30 * WT_MILLION));
+
+ /* Sweep the handles. */
+ WT_ERR(__sweep(session));
+ }
+
+ if (0) {
+err: __wt_err(session, ret, "handle sweep server error");
+ }
+ return (NULL);
+}
+
+/*
+ * __wt_sweep_create --
+ * Start the handle sweep thread.
+ */
+int
+__wt_sweep_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* Set first, the thread might run before we finish up. */
+ F_SET(conn, WT_CONN_SERVER_SWEEP);
+
+ WT_RET(__wt_open_internal_session(
+ conn, "sweep-server", 1, 1, &conn->sweep_session));
+ session = conn->sweep_session;
+
+ /*
+	 * Handle sweep does enough I/O that it may be called upon to perform
+	 * slow operations for the block manager.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT);
+
+ WT_RET(__wt_cond_alloc(
+ session, "handle sweep server", 0, &conn->sweep_cond));
+
+ WT_RET(__wt_thread_create(
+ session, &conn->sweep_tid, __sweep_server, session));
+ conn->sweep_tid_set = 1;
+
+ return (0);
+}
+
+/*
+ * __wt_sweep_destroy --
+ * Destroy the handle-sweep thread.
+ */
+int
+__wt_sweep_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ F_CLR(conn, WT_CONN_SERVER_SWEEP);
+ if (conn->sweep_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->sweep_cond));
+ WT_TRET(__wt_thread_join(session, conn->sweep_tid));
+ conn->sweep_tid_set = 0;
+ }
+ WT_TRET(__wt_cond_destroy(session, &conn->sweep_cond));
+
+ if (conn->sweep_session != NULL) {
+ wt_session = &conn->sweep_session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+
+ conn->sweep_session = NULL;
+ }
+ return (ret);
+}
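
The destroy path uses the same shutdown handshake as the statistics server above: clear the run flag so the loop condition fails, signal the condition so a sleeping server re-tests it, then join. A generic pthreads sketch of that ordering, with illustrative names:

	#include <pthread.h>

	static volatile int server_running = 1;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

	static void
	server_stop(pthread_t tid)
	{
		pthread_mutex_lock(&lock);
		server_running = 0;		/* loop test now fails */
		pthread_cond_signal(&cond);	/* wake a sleeping server */
		pthread_mutex_unlock(&lock);
		(void)pthread_join(tid, NULL);	/* wait for it to exit */
	}
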
diff --git a/src/third_party/wiredtiger/src/cursor/cur_backup.c b/src/third_party/wiredtiger/src/cursor/cur_backup.c
new file mode 100644
index 00000000000..85a85521213
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_backup.c
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __backup_all(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_cleanup_handles(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_file_create(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
+static int __backup_file_remove(WT_SESSION_IMPL *);
+static int __backup_list_all_append(WT_SESSION_IMPL *, const char *[]);
+static int __backup_list_append(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *);
+static int __backup_start(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]);
+static int __backup_stop(WT_SESSION_IMPL *);
+static int __backup_uri(
+ WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[], int *);
+
+/*
+ * __curbackup_next --
+ * WT_CURSOR->next method for the backup cursor type.
+ */
+static int
+__curbackup_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ if (cb->list == NULL || cb->list[cb->next].name == NULL) {
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+ WT_ERR(WT_NOTFOUND);
+ }
+
+ cb->iface.key.data = cb->list[cb->next].name;
+ cb->iface.key.size = strlen(cb->list[cb->next].name) + 1;
+ ++cb->next;
+
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbackup_reset --
+ * WT_CURSOR->reset method for the backup cursor type.
+ */
+static int
+__curbackup_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ cb->next = 0;
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbackup_close --
+ * WT_CURSOR->close method for the backup cursor type.
+ */
+static int
+__curbackup_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int tret;
+
+ cb = (WT_CURSOR_BACKUP *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ WT_TRET(__backup_cleanup_handles(session, cb));
+ WT_TRET(__wt_cursor_close(cursor));
+ session->bkp_cursor = NULL;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ tret = __backup_stop(session)); /* Stop the backup. */
+ WT_TRET(tret);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curbackup_open --
+ * WT_SESSION->open_cursor method for the backup cursor type.
+ */
+int
+__wt_curbackup_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_notsup, /* get-value */
+ __wt_cursor_notsup, /* set-key */
+ __wt_cursor_notsup, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curbackup_next, /* next */
+ __wt_cursor_notsup, /* prev */
+ __curbackup_reset, /* reset */
+ __wt_cursor_notsup, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curbackup_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_BACKUP *cb;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);
+
+ cb = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &cb));
+ cursor = &cb->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ session->bkp_cursor = cb;
+
+ cursor->key_format = "S"; /* Return the file names as the key. */
+ cursor->value_format = ""; /* No value. */
+
+ /*
+ * Start the backup and fill in the cursor's list. Acquire the schema
+	 * lock; we need a consistent view when creating a copy.
+ */
+ WT_WITH_SCHEMA_LOCK(session, ret = __backup_start(session, cb, cfg));
+ WT_ERR(ret);
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) {
+err: __wt_free(session, cb);
+ }
+
+ return (ret);
+}
+
+/*
+ * __backup_start --
+ * Start a backup.
+ */
+static int
+__backup_start(
+ WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ u_int i, logcount;
+ int exist, target_list;
+ char **logfiles;
+
+ conn = S2C(session);
+
+ cb->next = 0;
+ cb->list = NULL;
+ logfiles = NULL;
+ logcount = 0;
+
+ /*
+ * Single thread hot backups: we're holding the schema lock, so we
+ * know we'll serialize with other attempts to start a hot backup.
+ */
+ if (conn->hot_backup)
+ WT_RET_MSG(
+ session, EINVAL, "there is already a backup cursor open");
+
+ /*
+ * The hot backup copy is done outside of WiredTiger, which means file
+ * blocks can't be freed and re-allocated until the backup completes.
+ * The checkpoint code checks the backup flag, and if a backup cursor
+	 * is open, checkpoints aren't discarded. We release the lock as soon
+	 * as we've set the flag; we don't want to block checkpoints, just
+	 * to make sure no checkpoints are deleted. The checkpoint code
+ * holds the lock until it's finished the checkpoint, otherwise we
+ * could start a hot backup that would race with an already-started
+ * checkpoint.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ conn->hot_backup = 1;
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ /* Create the hot backup file. */
+ WT_ERR(__backup_file_create(session, cb));
+
+ /*
+ * If a list of targets was specified, work our way through them.
+ * Else, generate a list of all database objects.
+ *
+ * Include log files if doing a full backup, and copy them before
+ * copying data files to avoid rolling the metadata forward across
+ * a checkpoint that completes during the backup.
+ */
+ target_list = 0;
+ WT_ERR(__backup_uri(session, cb, cfg, &target_list));
+ if (!target_list) {
+ if (conn->log) {
+ WT_ERR(__wt_log_get_active_files(
+ session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++)
+ WT_ERR(__backup_list_append(
+ session, cb, logfiles[i]));
+ }
+
+ WT_ERR(__backup_all(session, cb));
+ }
+
+ /* Add the hot backup and standard WiredTiger files to the list. */
+ WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
+ WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
+ if (exist)
+ WT_ERR(__backup_list_append(session, cb, WT_BASECONFIG));
+ WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
+ if (exist)
+ WT_ERR(__backup_list_append(session, cb, WT_USERCONFIG));
+ WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
+
+err: /* Close the hot backup file. */
+ if (cb->bfp != NULL) {
+ WT_TRET(fclose(cb->bfp) == 0 ? 0 : __wt_errno());
+ cb->bfp = NULL;
+ }
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+
+ if (ret != 0) {
+ WT_TRET(__backup_cleanup_handles(session, cb));
+ WT_TRET(__backup_stop(session));
+ }
+
+ return (ret);
+}
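
From the application's side, all of the above hangs off a single cursor open. A hedged usage sketch (error handling is abbreviated, copy_file is an application-supplied placeholder, and the table is assumed to already exist):

	extern void copy_file(const char *);	/* application-supplied */

	static int
	take_backup(WT_SESSION *session)
	{
		WT_CURSOR *cursor;
		const char *filename;
		int ret;

		/* Opening the cursor runs __backup_start, pinning
		 * checkpoints until the backup completes. */
		if ((ret = session->open_cursor(
		    session, "backup:", NULL, NULL, &cursor)) != 0)
			return (ret);

		/* Each key is the name of a file to copy. */
		while ((ret = cursor->next(cursor)) == 0) {
			if ((ret = cursor->get_key(cursor, &filename)) != 0)
				break;
			copy_file(filename);
		}

		/* Closing the cursor runs __backup_stop, re-enabling
		 * checkpoint deletion and the next hot backup. */
		return (cursor->close(cursor));
	}
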
+
+/*
+ * __backup_cleanup_handles --
+ * Release and free all btree handles held by the backup. This is kept
+ * separate from __backup_stop because it can be called without the
+ * schema lock held.
+ */
+static int
+__backup_cleanup_handles(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+ WT_CURSOR_BACKUP_ENTRY *p;
+ WT_DECL_RET;
+
+ if (cb->list == NULL)
+ return (0);
+
+ /* Release the handles, free the file names, free the list itself. */
+ for (p = cb->list; p->name != NULL; ++p) {
+ if (p->handle != NULL)
+ WT_WITH_DHANDLE(session, p->handle,
+ WT_TRET(__wt_session_release_btree(session)));
+ __wt_free(session, p->name);
+ }
+
+ __wt_free(session, cb->list);
+ return (ret);
+}
+
+/*
+ * __backup_stop --
+ * Stop a backup.
+ */
+static int
+__backup_stop(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ /* Remove any backup metadata file. */
+ ret = __backup_file_remove(session);
+
+ /* Checkpoint deletion can proceed, as can the next hot backup. */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ conn->hot_backup = 0;
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ return (ret);
+}
+
+/*
+ * __backup_all --
+ * Backup all objects in the database.
+ */
+static int
+__backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *key, *value;
+
+ cursor = NULL;
+
+ /*
+ * Open a cursor on the metadata file and copy all of the entries to
+ * the hot backup file.
+ */
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &key));
+ WT_ERR(cursor->get_value(cursor, &value));
+ WT_ERR_TEST((fprintf(
+ cb->bfp, "%s\n%s\n", key, value) < 0), __wt_errno());
+
+ /*
+ * While reading the metadata file, check there are no "sources"
+ * or "types" which can't support hot backup. This checks for
+ * a data source that's non-standard, which can't be backed up,
+ * but is also sanity checking: if there's an entry backed by
+ * anything other than a file or lsm entry, we're confused.
+ */
+ if ((ret = __wt_config_getones(
+ session, value, "type", &cval)) == 0 &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm"))
+ WT_ERR_MSG(session, ENOTSUP,
+ "hot backup is not supported for objects of "
+ "type %.*s", (int)cval.len, cval.str);
+ WT_ERR_NOTFOUND_OK(ret);
+		if ((ret = __wt_config_getones(
+ session, value, "source", &cval)) == 0 &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") &&
+ !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:"))
+ WT_ERR_MSG(session, ENOTSUP,
+ "hot backup is not supported for objects of "
+ "source %.*s", (int)cval.len, cval.str);
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* Build a list of the file objects that need to be copied. */
+ WT_ERR(__wt_meta_btree_apply(session, __backup_list_all_append, NULL));
+
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __backup_uri --
+ * Backup a list of objects.
+ */
+static int
+__backup_uri(WT_SESSION_IMPL *session,
+ WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp)
+{
+ WT_CONFIG targetconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ int target_list;
+ const char *uri;
+
+ *foundp = target_list = 0;
+
+ /*
+	 * If we find a non-empty target configuration string, we have a job;
+	 * otherwise it's not our problem.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "target", &cval));
+ WT_RET(__wt_config_subinit(session, &targetconf, &cval));
+ for (cb->list_next = 0;
+ (ret = __wt_config_next(&targetconf, &k, &v)) == 0;) {
+ if (!target_list) {
+ target_list = *foundp = 1;
+
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ }
+
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
+ uri = tmp->data;
+ if (v.len != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "%s: invalid backup target: URIs may need quoting",
+ uri);
+
+ WT_ERR(__wt_schema_worker(
+ session, uri, NULL, __wt_backup_list_uri_append, cfg, 0));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
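
__backup_uri is the target branch of the same API: a target list in the open_cursor configuration restricts the backup to the named URIs. A sketch, assuming an existing table:mytable object; note the quoting the error message above insists on:

	WT_CURSOR *cursor;
	int ret;

	/* Back up a single object rather than the whole database. */
	ret = session->open_cursor(session,
	    "backup:", NULL, "target=(\"table:mytable\")", &cursor);
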
+
+/*
+ * __backup_file_create --
+ * Create the meta-data backup file.
+ */
+static int
+__backup_file_create(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
+{
+ WT_DECL_RET;
+ char *path;
+
+ /* Open the hot backup file. */
+ WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &path));
+ WT_ERR_TEST((cb->bfp = fopen(path, "w")) == NULL, __wt_errno());
+
+err: __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __backup_file_remove --
+ * Remove the meta-data backup file.
+ */
+static int
+__backup_file_remove(WT_SESSION_IMPL *session)
+{
+ return (__wt_remove(session, WT_METADATA_BACKUP));
+}
+
+/*
+ * __wt_backup_list_uri_append --
+ * Append a new file name to the list, allocate space as necessary.
+ * Called via the schema_worker function.
+ */
+int
+__wt_backup_list_uri_append(
+ WT_SESSION_IMPL *session, const char *name, int *skip)
+{
+ WT_CURSOR_BACKUP *cb;
+ const char *value;
+
+ cb = session->bkp_cursor;
+ WT_UNUSED(skip);
+
+ /* Add the metadata entry to the backup file. */
+ WT_RET(__wt_metadata_search(session, name, &value));
+ WT_RET_TEST(
+ (fprintf(cb->bfp, "%s\n%s\n", name, value) < 0), __wt_errno());
+ __wt_free(session, value);
+
+ /* Add file type objects to the list of files to be copied. */
+ if (WT_PREFIX_MATCH(name, "file:"))
+ WT_RET(__backup_list_append(session, cb, name));
+
+ return (0);
+}
+
+/*
+ * __backup_list_all_append --
+ * Append a new file name to the list, allocate space as necessary.
+ * Called via the __wt_meta_btree_apply function.
+ */
+static int
+__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CURSOR_BACKUP *cb;
+
+ WT_UNUSED(cfg);
+
+ cb = session->bkp_cursor;
+
+ /* Ignore files in the process of being bulk-loaded. */
+ if (F_ISSET(S2BT(session), WT_BTREE_BULK))
+ return (0);
+
+ /* Add the file to the list of files to be copied. */
+ return (__backup_list_append(session, cb, session->dhandle->name));
+}
+
+/*
+ * __backup_list_append --
+ * Append a new file name to the list, allocate space as necessary.
+ */
+static int
+__backup_list_append(
+ WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *uri)
+{
+ WT_CURSOR_BACKUP_ENTRY *p;
+ WT_DATA_HANDLE *old_dhandle;
+ WT_DECL_RET;
+ const char *name;
+ int need_handle;
+
+ /* Leave a NULL at the end to mark the end of the list. */
+ WT_RET(__wt_realloc_def(session, &cb->list_allocated,
+ cb->list_next + 2, &cb->list));
+ p = &cb->list[cb->list_next];
+ p[0].name = p[1].name = NULL;
+ p[0].handle = p[1].handle = NULL;
+
+ need_handle = 0;
+ name = uri;
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ need_handle = 1;
+ name += strlen("file:");
+ }
+
+ /*
+ * !!!
+ * Assumes metadata file entries map one-to-one to physical files.
+ * To support a block manager where that's not the case, we'd need
+ * to call into the block manager and get a list of physical files
+ * that map to this logical "file". I'm not going to worry about
+	 * that for now; that block manager might not even support physical
+ * copying of files by applications.
+ */
+ WT_RET(__wt_strdup(session, name, &p->name));
+
+ /*
+ * If it's a file in the database, get a handle for the underlying
+ * object (this handle blocks schema level operations, for example
+ * WT_SESSION.drop or an LSM file discard after level merging).
+ */
+ if (need_handle) {
+ old_dhandle = session->dhandle;
+ if ((ret =
+ __wt_session_get_btree(session, uri, NULL, NULL, 0)) == 0)
+ p->handle = session->dhandle;
+ session->dhandle = old_dhandle;
+ WT_RET(ret);
+ }
+
+ ++cb->list_next;
+ return (0);
+}
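
The allocation above always grows the array by two slots so a NULL entry trails the list; __curbackup_next and __backup_cleanup_handles both stop on that sentinel. The same idiom in isolation, with plain realloc and illustrative names:

	#include <stdlib.h>
	#include <string.h>

	struct entry { char *name; };

	/* Append to a NULL-terminated array, keeping the sentinel. */
	static int
	list_append(struct entry **listp, size_t *nextp, const char *name)
	{
		struct entry *list;
		size_t next = *nextp;

		if ((list =
		    realloc(*listp, (next + 2) * sizeof(*list))) == NULL)
			return (-1);
		list[next + 1].name = NULL;	/* sentinel marks the end */
		*listp = list;
		if ((list[next].name = strdup(name)) == NULL)
			return (-1);
		*nextp = next + 1;
		return (0);
	}
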
diff --git a/src/third_party/wiredtiger/src/cursor/cur_bulk.c b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
new file mode 100644
index 00000000000..96a45a7e629
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_bulk.c
@@ -0,0 +1,287 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curbulk_insert_fix --
+ * Fixed-length column-store bulk cursor insert.
+ */
+static int
+__curbulk_insert_fix(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_ERR(__wt_bulk_insert_fix(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_var --
+ * Variable-length column-store bulk cursor insert.
+ */
+static int
+__curbulk_insert_var(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int duplicate;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ /*
+ * If this isn't the first value inserted, compare it against the last
+ * value and increment the RLE count.
+ *
+ * Instead of a "first time" variable, I'm using the RLE count, because
+ * it is only zero before the first row is inserted.
+ */
+ duplicate = 0;
+ if (cbulk->rle != 0) {
+ if (cbulk->last.size == cursor->value.size &&
+ memcmp(cbulk->last.data, cursor->value.data,
+ cursor->value.size) == 0) {
+ ++cbulk->rle;
+ duplicate = 1;
+ } else
+ WT_ERR(__wt_bulk_insert_var(session, cbulk));
+ }
+
+ /*
+ * Save a copy of the value for the next comparison and reset the RLE
+ * counter.
+ */
+ if (!duplicate) {
+ WT_ERR(__wt_buf_set(session,
+ &cbulk->last, cursor->value.data, cursor->value.size));
+ cbulk->rle = 1;
+ }
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
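
The run-length logic above defers each value until the next distinct one arrives, so equal consecutive values collapse into a single (value, count) pair. A standalone sketch of the same accumulation; emit_run stands in for __wt_bulk_insert_var, and the fixed-size buffer is purely illustrative:

	#include <stdint.h>
	#include <string.h>

	static void emit_run(const void *, size_t, uint64_t);

	static char last[64];		/* illustrative: assumes size <= 64 */
	static size_t last_size;
	static uint64_t rle;		/* 0 means no value seen yet */

	static void
	rle_insert(const void *data, size_t size)
	{
		if (rle != 0 &&
		    size == last_size && memcmp(last, data, size) == 0) {
			++rle;			/* duplicate: count it */
			return;
		}
		if (rle != 0)			/* emit the finished run */
			emit_run(last, last_size, rle);
		memcpy(last, data, size);	/* start a new run */
		last_size = size;
		rle = 1;
	}
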
+
+/*
+ * __bulk_row_keycmp_err --
+ * Error routine when keys inserted out-of-order.
+ */
+static int
+__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(a);
+ WT_DECL_ITEM(b);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
+ cursor = &cbulk->cbt.iface;
+
+ WT_ERR(__wt_scr_alloc(session, 512, &a));
+ WT_ERR(__wt_scr_alloc(session, 512, &b));
+
+ WT_ERR(__wt_buf_set_printable(
+ session, a, cursor->key.data, cursor->key.size));
+ WT_ERR(__wt_buf_set_printable(
+ session, b, cbulk->last.data, cbulk->last.size));
+
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load presented with out-of-order keys: %.*s compares smaller "
+ "than previously inserted key %.*s",
+ (int)a->size, (const char *)a->data,
+ (int)b->size, (const char *)b->data);
+
+err: __wt_scr_free(&a);
+ __wt_scr_free(&b);
+ return (ret);
+}
+
+/*
+ * __curbulk_insert_row --
+ * Row-store bulk cursor insert, with key-sort checks.
+ */
+static int
+__curbulk_insert_row(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int cmp;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_CHECKKEY(cursor);
+ WT_CURSOR_CHECKVALUE(cursor);
+
+ /*
+ * If this isn't the first key inserted, compare it against the last key
+ * to ensure the application doesn't accidentally corrupt the table.
+ *
+ * Instead of a "first time" variable, I'm using the RLE count, because
+ * it is only zero before the first row is inserted.
+ */
+ if (cbulk->rle != 0) {
+ WT_ERR(__wt_compare(session,
+ btree->collator, &cursor->key, &cbulk->last, &cmp));
+ if (cmp <= 0)
+ WT_ERR(__bulk_row_keycmp_err(cbulk));
+ }
+
+ /*
+ * Save a copy of the key for the next comparison and set the RLE
+ * counter.
+ */
+ WT_ERR(__wt_buf_set(session,
+ &cbulk->last, cursor->key.data, cursor->key.size));
+ cbulk->rle = 1;
+
+ WT_ERR(__wt_bulk_insert_row(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_insert_row_skip_check --
+ * Row-store bulk cursor insert, without key-sort checks.
+ */
+static int
+__curbulk_insert_row_skip_check(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ /*
+ * Bulk cursor inserts are updates, but don't need auto-commit
+ * transactions because they are single-threaded and not visible
+ * until the bulk cursor is closed.
+ */
+ CURSOR_API_CALL(cursor, session, insert, btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_ERR(__wt_bulk_insert_row(session, cbulk));
+
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curbulk_close --
+ * WT_CURSOR->close for the bulk cursor type.
+ */
+static int
+__curbulk_close(WT_CURSOR *cursor)
+{
+ WT_BTREE *btree;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbulk = (WT_CURSOR_BULK *)cursor;
+ btree = cbulk->cbt.btree;
+
+ CURSOR_API_CALL(cursor, session, close, btree);
+
+ WT_TRET(__wt_bulk_wrapup(session, cbulk));
+ __wt_buf_free(session, &cbulk->last);
+
+ WT_TRET(__wt_session_release_btree(session));
+
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curbulk_init --
+ * Initialize a bulk cursor.
+ */
+int
+__wt_curbulk_init(WT_SESSION_IMPL *session,
+ WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check)
+{
+ WT_CURSOR *c;
+ WT_CURSOR_BTREE *cbt;
+
+ c = &cbulk->cbt.iface;
+ cbt = &cbulk->cbt;
+
+ /* Bulk cursors only support insert and close (reset is a no-op). */
+ __wt_cursor_set_notsup(c);
+ switch (cbt->btree->type) {
+ case BTREE_COL_FIX:
+ c->insert = __curbulk_insert_fix;
+ break;
+ case BTREE_COL_VAR:
+ c->insert = __curbulk_insert_var;
+ break;
+ case BTREE_ROW:
+ c->insert = skip_sort_check ?
+ __curbulk_insert_row_skip_check : __curbulk_insert_row;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ c->close = __curbulk_close;
+
+ cbulk->bitmap = bitmap;
+ if (bitmap)
+ F_SET(c, WT_CURSTD_RAW);
+
+ return (__wt_bulk_init(session, cbulk));
+}
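
Applications reach these insert methods by opening a cursor with the bulk configuration on an empty object. A hedged sketch, assuming a table created with string key and value formats:

	static int
	bulk_load(WT_SESSION *session)
	{
		WT_CURSOR *cursor;
		int ret;

		/* "bulk" routes inserts through this single-threaded path;
		 * row-store keys must arrive in sorted order unless the
		 * sort check is skipped. */
		if ((ret = session->open_cursor(session,
		    "table:bulkload", NULL, "bulk", &cursor)) != 0)
			return (ret);

		cursor->set_key(cursor, "key1");
		cursor->set_value(cursor, "value1");
		if ((ret = cursor->insert(cursor)) != 0)
			return (ret);

		/* Closing the bulk cursor makes the loaded rows visible. */
		return (cursor->close(cursor));
	}
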
diff --git a/src/third_party/wiredtiger/src/cursor/cur_config.c b/src/third_party/wiredtiger/src/cursor/cur_config.c
new file mode 100644
index 00000000000..868b144efc1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_config.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curconfig_close --
+ * WT_CURSOR->close method for the config cursor type.
+ */
+static int
+__curconfig_close(WT_CURSOR *cursor)
+{
+ return (__wt_cursor_close(cursor));
+}
+
+/*
+ * __wt_curconfig_open --
+ * WT_SESSION->open_cursor method for config cursors.
+ */
+int
+__wt_curconfig_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __wt_cursor_notsup, /* next */
+ __wt_cursor_notsup, /* prev */
+ __wt_cursor_noop, /* reset */
+ __wt_cursor_notsup, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curconfig_close);
+ WT_CURSOR_CONFIG *cconfig;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0);
+
+ WT_UNUSED(uri);
+
+ WT_RET(__wt_calloc_def(session, 1, &cconfig));
+
+ cursor = &cconfig->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->key_format = cursor->value_format = "S";
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) {
+err: __wt_free(session, cconfig);
+ }
+ return (ret);
+}
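
The offsetof static assert is what makes casts like the one in __curconfig_close safe: every concrete cursor embeds WT_CURSOR as its first member, so a WT_CURSOR pointer and its containing struct share an address. The idiom with an illustrative type:

	typedef struct {
		WT_CURSOR iface;	/* must be first: shared address */
		int my_state;		/* implementation-private fields */
	} MY_CURSOR;

	static int
	my_close(WT_CURSOR *cursor)
	{
		/* Valid only because offsetof(MY_CURSOR, iface) == 0. */
		MY_CURSOR *mc = (MY_CURSOR *)cursor;

		mc->my_state = 0;
		return (0);
	}
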
diff --git a/src/third_party/wiredtiger/src/cursor/cur_ds.c b/src/third_party/wiredtiger/src/cursor/cur_ds.c
new file mode 100644
index 00000000000..33e89764617
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_ds.c
@@ -0,0 +1,524 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curds_txn_enter --
+ * Do transactional initialization when starting an operation.
+ */
+static int
+__curds_txn_enter(WT_SESSION_IMPL *session)
+{
+ session->ncursors++; /* XXX */
+ __wt_txn_cursor_op(session);
+
+ return (0);
+}
+
+/*
+ * __curds_txn_leave --
+ * Do transactional cleanup when ending an operation.
+ */
+static void
+__curds_txn_leave(WT_SESSION_IMPL *session)
+{
+ if (--session->ncursors == 0) /* XXX */
+ __wt_txn_read_last(session);
+}
+
+/*
+ * __curds_key_set --
+ * Set the key for the data-source.
+ */
+static int
+__curds_key_set(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ WT_CURSOR_NEEDKEY(cursor);
+
+ source->recno = cursor->recno;
+ source->key.data = cursor->key.data;
+ source->key.size = cursor->key.size;
+
+err: return (ret);
+}
+
+/*
+ * __curds_value_set --
+ * Set the value for the data-source.
+ */
+static int
+__curds_value_set(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ source->value.data = cursor->value.data;
+ source->value.size = cursor->value.size;
+
+err: return (ret);
+}
+
+/*
+ * __curds_cursor_resolve --
+ * Resolve cursor operation.
+ */
+static int
+__curds_cursor_resolve(WT_CURSOR *cursor, int ret)
+{
+ WT_CURSOR *source;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ /*
+ * Update the cursor's key, value and flags. (We use the _INT flags in
+ * the same way as file objects: there's some chance the underlying data
+	 * source is passing us a reference to data only pinned per
+	 * operation; we might as well be safe.)
+ *
+ * There's also a requirement the underlying data-source never returns
+ * with the cursor/source key referencing application memory: it'd be
+ * great to do a copy as necessary here so the data-source doesn't have
+ * to worry about copying the key, but we don't have enough information
+ * to know if a cursor is pointing at application or data-source memory.
+ */
+ if (ret == 0) {
+ cursor->key.data = source->key.data;
+ cursor->key.size = source->key.size;
+ cursor->value.data = source->value.data;
+ cursor->value.size = source->value.size;
+ cursor->recno = source->recno;
+
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ } else {
+ if (ret == WT_NOTFOUND)
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ else
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+ /*
+ * Cursor operation failure implies a lost cursor position and
+ * a subsequent next/prev starting at the beginning/end of the
+ * table. We simplify underlying data source implementations
+ * by resetting the cursor explicitly here.
+ */
+ WT_TRET(source->reset(source));
+ }
+
+ return (ret);
+}
+
+/*
+ * __curds_compare --
+ * WT_CURSOR.compare method for the data-source cursor type.
+ */
+static int
+__curds_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_COLLATOR *collator;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * compare them.
+ */
+ if (strcmp(a->internal_uri, b->internal_uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Cursors must reference the same object");
+
+ WT_CURSOR_NEEDKEY(a);
+ WT_CURSOR_NEEDKEY(b);
+
+ if (WT_CURSOR_RECNO(a)) {
+ if (a->recno < b->recno)
+ *cmpp = -1;
+ else if (a->recno == b->recno)
+ *cmpp = 0;
+ else
+ *cmpp = 1;
+ } else {
+ /*
+		 * The assumption is that data sources don't provide WiredTiger
+		 * with WT_CURSOR.compare methods; instead, we copy the
+		 * key/value out of the underlying data-source cursor and do
+		 * any comparison at this level.
+ */
+ collator = ((WT_CURSOR_DATA_SOURCE *)a)->collator;
+ WT_ERR(__wt_compare(
+ session, collator, &a->key, &b->key, cmpp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curds_next --
+ * WT_CURSOR.next method for the data-source cursor type.
+ */
+static int
+__curds_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ ret = __curds_cursor_resolve(cursor, source->next(source));
+
+err: __curds_txn_leave(session);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_prev --
+ * WT_CURSOR.prev method for the data-source cursor type.
+ */
+static int
+__curds_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_prev);
+ WT_STAT_FAST_DATA_INCR(session, cursor_prev);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ ret = __curds_cursor_resolve(cursor, source->prev(source));
+
+err: __curds_txn_leave(session);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_reset --
+ * WT_CURSOR.reset method for the data-source cursor type.
+ */
+static int
+__curds_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_reset);
+ WT_STAT_FAST_DATA_INCR(session, cursor_reset);
+
+ WT_ERR(source->reset(source));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curds_search --
+ * WT_CURSOR.search method for the data-source cursor type.
+ */
+static int
+__curds_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->search(source));
+
+err: __curds_txn_leave(session);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_search_near --
+ * WT_CURSOR.search_near method for the data-source cursor type.
+ */
+static int
+__curds_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search_near);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ ret =
+ __curds_cursor_resolve(cursor, source->search_near(source, exact));
+
+err: __curds_txn_leave(session);
+
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curds_insert --
+ * WT_CURSOR.insert method for the data-source cursor type.
+ */
+static int
+__curds_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCR(session, cursor_insert);
+ WT_STAT_FAST_DATA_INCRV(session,
+ cursor_insert_bytes, cursor->key.size + cursor->value.size);
+
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND))
+ WT_ERR(__curds_key_set(cursor));
+ WT_ERR(__curds_value_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->insert(source));
+
+err: __curds_txn_leave(session);
+
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curds_update --
+ * WT_CURSOR.update method for the data-source cursor type.
+ */
+static int
+__curds_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCR(session, cursor_update);
+ WT_STAT_FAST_DATA_INCRV(
+ session, cursor_update_bytes, cursor->value.size);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ WT_ERR(__curds_value_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->update(source));
+
+err: __curds_txn_leave(session);
+
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curds_remove --
+ * WT_CURSOR.remove method for the data-source cursor type.
+ */
+static int
+__curds_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR *source;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCR(session, cursor_remove);
+ WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
+
+ WT_ERR(__curds_txn_enter(session));
+
+ WT_ERR(__curds_key_set(cursor));
+ ret = __curds_cursor_resolve(cursor, source->remove(source));
+
+err: __curds_txn_leave(session);
+
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curds_close --
+ * WT_CURSOR.close method for the data-source cursor type.
+ */
+static int
+__curds_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_DATA_SOURCE *cds;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cds = (WT_CURSOR_DATA_SOURCE *)cursor;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ if (cds->source != NULL)
+ ret = cds->source->close(cds->source);
+
+ if (cds->collator_owned) {
+ if (cds->collator->terminate != NULL)
+ WT_TRET(cds->collator->terminate(
+ cds->collator, &session->iface));
+ cds->collator_owned = 0;
+ }
+ cds->collator = NULL;
+
+ /*
+ * The key/value formats are in allocated memory, which isn't standard
+ * behavior.
+ */
+ __wt_free(session, cursor->key_format);
+ __wt_free(session, cursor->value_format);
+
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curds_open --
+ * Initialize a data-source cursor.
+ */
+int
+__wt_curds_open(
+ WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
+ const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curds_compare, /* compare */
+ __curds_next, /* next */
+ __curds_prev, /* prev */
+ __curds_reset, /* reset */
+ __curds_search, /* search */
+ __curds_search_near, /* search-near */
+ __curds_insert, /* insert */
+ __curds_update, /* update */
+ __curds_remove, /* remove */
+ __curds_close); /* close */
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor, *source;
+ WT_CURSOR_DATA_SOURCE *data_source;
+ WT_DECL_RET;
+ const char *metaconf;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);
+
+ data_source = NULL;
+ metaconf = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &data_source));
+ cursor = &data_source->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ F_SET(cursor, WT_CURSTD_DATA_SOURCE);
+
+ /*
+ * XXX
+ * The underlying data-source may require the object's key and value
+ * formats. This isn't a particularly elegant way of getting that
+	 * information to the data-source; this feels like a layering problem
+ * to me.
+ */
+ WT_ERR(__wt_metadata_search(session, uri, &metaconf));
+ WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
+ WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
+ WT_ERR(
+ __wt_strndup(session, cval.str, cval.len, &cursor->value_format));
+
+ WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
+
+ /* Data-source cursors have a collator reference. */
+ WT_ERR(__wt_collator_config(session, cfg,
+ &data_source->collator, &data_source->collator_owned));
+
+ WT_ERR(dsrc->open_cursor(dsrc,
+ &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source));
+ source = data_source->source;
+ source->session = (WT_SESSION *)session;
+ memset(&source->q, 0, sizeof(source->q));
+ source->recno = 0;
+ memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
+ memset(&source->key, 0, sizeof(source->key));
+ memset(&source->value, 0, sizeof(source->value));
+ source->saved_err = 0;
+ source->flags = 0;
+
+ if (0) {
+err: if (F_ISSET(cursor, WT_CURSTD_OPEN))
+ WT_TRET(cursor->close(cursor));
+ else
+ __wt_free(session, data_source);
+ *cursorp = NULL;
+ }
+
+ __wt_free(session, metaconf);
+ return (ret);
+}
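
These methods only come into play for URIs owned by an application-registered data source. A hedged sketch of the registration side, assuming my_dsrc implements the WT_DATA_SOURCE interface (at minimum open_cursor) and that conn and session are already open:

	WT_CURSOR *cursor;
	int ret;

	/* Route "kvstore:" URIs through the custom implementation... */
	ret = conn->add_data_source(conn, "kvstore:", &my_dsrc, NULL);

	/* ...after which ordinary cursor opens on that prefix land in
	 * __wt_curds_open; "kvstore:telemetry" is an illustrative URI. */
	ret = session->open_cursor(session,
	    "kvstore:telemetry", NULL, NULL, &cursor);
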
diff --git a/src/third_party/wiredtiger/src/cursor/cur_dump.c b/src/third_party/wiredtiger/src/cursor/cur_dump.c
new file mode 100644
index 00000000000..003b7e1f961
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_dump.c
@@ -0,0 +1,400 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __raw_to_dump --
+ *	We have a buffer where the data item contains a raw value;
+ *	convert it to a printable string.
+ */
+static int
+__raw_to_dump(
+ WT_SESSION_IMPL *session, WT_ITEM *from, WT_ITEM *to, int hexonly)
+{
+ if (hexonly)
+ WT_RET(__wt_raw_to_hex(session, from->data, from->size, to));
+ else
+ WT_RET(
+ __wt_raw_to_esc_hex(session, from->data, from->size, to));
+
+ return (0);
+}
+
+/*
+ * __dump_to_raw --
+ *	We have a buffer containing a dump string;
+ *	convert it to a raw value.
+ */
+static int
+__dump_to_raw(
+ WT_SESSION_IMPL *session, const char *src_arg, WT_ITEM *item, int hexonly)
+{
+ if (hexonly)
+ WT_RET(__wt_hex_to_raw(session, src_arg, item));
+ else
+ WT_RET(__wt_esc_hex_to_raw(session, src_arg, item));
+
+ return (0);
+}
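
The two encodings trade readability for uniformity: hexonly renders every byte as two hex digits, while the escaped form keeps printable characters and escapes the rest. Illustratively (the exact escape syntax is __wt_raw_to_esc_hex's, so treat this as a sketch), the three bytes 'a', 'b', 0x01 come out as:

	/*
	 * raw bytes:		0x61 0x62 0x01
	 * hexonly:		"616201"
	 * escaped (print):	"ab\01"
	 */
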
+
+/*
+ * __curdump_get_key --
+ * WT_CURSOR->get_key for dump cursors.
+ */
+static int
+__curdump_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *child;
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_DECL_RET;
+ WT_ITEM item, *itemp;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ uint64_t recno;
+ const char *fmt;
+ const void *buffer;
+ va_list ap;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ WT_ASSERT(session, json != NULL);
+ if (WT_CURSOR_RECNO(cursor)) {
+ WT_ERR(child->get_key(child, &recno));
+ buffer = &recno;
+ size = sizeof(recno);
+ fmt = "R";
+ } else {
+ WT_ERR(__wt_cursor_get_raw_key(child, &item));
+ buffer = item.data;
+ size = item.size;
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ fmt = "u";
+ else
+ fmt = cursor->key_format;
+ }
+ ret = __wt_json_alloc_unpack(session, buffer, size, fmt,
+ json, 1, ap);
+ } else {
+ if (WT_CURSOR_RECNO(cursor) &&
+ !F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(child->get_key(child, &recno));
+
+ WT_ERR(__wt_buf_fmt(session, &cursor->key, "%"
+ PRIu64, recno));
+ } else {
+ WT_ERR(child->get_key(child, &item));
+
+ WT_ERR(__raw_to_dump(session, &item, &cursor->key,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+ }
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ itemp = va_arg(ap, WT_ITEM *);
+ itemp->data = cursor->key.data;
+ itemp->size = cursor->key.size;
+ } else
+ *va_arg(ap, const char **) = cursor->key.data;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * str2recno --
+ * Convert a string to a record number.
+ */
+static int
+str2recno(WT_SESSION_IMPL *session, const char *p, uint64_t *recnop)
+{
+ uint64_t recno;
+ char *endptr;
+
+ /*
+	 * strtouq takes lots of things like hex values, signs and so on --
+	 * none of them are OK with us. Check that the string starts with a
+	 * digit; that turns off the special processing.
+ */
+ if (!isdigit(p[0]))
+ goto format;
+
+ errno = 0;
+ recno = __wt_strtouq(p, &endptr, 0);
+ if (recno == ULLONG_MAX && errno == ERANGE)
+ WT_RET_MSG(session, ERANGE, "%s: invalid record number", p);
+ if (endptr[0] != '\0')
+format: WT_RET_MSG(session, EINVAL, "%s: invalid record number", p);
+
+ *recnop = recno;
+ return (0);
+}
+
+/*
+ * __curdump_set_key --
+ * WT_CURSOR->set_key for dump cursors.
+ */
+static void
+__curdump_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t recno;
+ va_list ap;
+ const char *p;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ p = va_arg(ap, WT_ITEM *)->data;
+ else
+ p = va_arg(ap, const char *);
+ va_end(ap);
+
+ if (WT_CURSOR_RECNO(cursor) && !F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(str2recno(session, p, &recno));
+
+ child->set_key(child, recno);
+ } else {
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_to_item(session, p, cursor->key_format,
+ (WT_CURSOR_JSON *)cursor->json_private, 1,
+ &cursor->key));
+ else
+ WT_ERR(__dump_to_raw(session, p, &cursor->key,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ child->set_key(child, &cursor->key);
+ }
+
+ if (0) {
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+ }
+ API_END(session, ret);
+}
+
+/*
+ * __curdump_get_value --
+ * WT_CURSOR->get_value for dump cursors.
+ */
+static int
+__curdump_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_ITEM item, *itemp;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ const char *fmt;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ WT_ASSERT(session, json != NULL);
+ WT_ERR(__wt_cursor_get_raw_value(child, &item));
+ fmt = F_ISSET(cursor, WT_CURSTD_RAW) ?
+ "u" : cursor->value_format;
+ ret = __wt_json_alloc_unpack(session, item.data,
+ item.size, fmt, json, 0, ap);
+ } else {
+ WT_ERR(child->get_value(child, &item));
+
+ WT_ERR(__raw_to_dump(session, &item, &cursor->value,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ itemp = va_arg(ap, WT_ITEM *);
+ itemp->data = cursor->value.data;
+ itemp->size = cursor->value.size;
+ } else
+ *va_arg(ap, const char **) = cursor->value.data;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curdump_set_value --
+ * WT_CURSOR->set_value for dump cursors.
+ */
+static void
+__curdump_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ const char *p;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW))
+ p = va_arg(ap, WT_ITEM *)->data;
+ else
+ p = va_arg(ap, const char *);
+ va_end(ap);
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_to_item(session, p, cursor->value_format,
+ (WT_CURSOR_JSON *)cursor->json_private, 0, &cursor->value));
+ else
+ WT_ERR(__dump_to_raw(session, p, &cursor->value,
+ F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0));
+
+ child->set_value(child, &cursor->value);
+
+ if (0) {
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ }
+ API_END(session, ret);
+}
+
+/* Pass through a call to the underlying cursor. */
+#define WT_CURDUMP_PASS(op) \
+static int \
+__curdump_##op(WT_CURSOR *cursor) \
+{ \
+ WT_CURSOR *child; \
+ \
+ child = ((WT_CURSOR_DUMP *)cursor)->child; \
+ return (child->op(child)); \
+}
+
+WT_CURDUMP_PASS(next)
+WT_CURDUMP_PASS(prev)
+WT_CURDUMP_PASS(reset)
+WT_CURDUMP_PASS(search)
+
+/*
+ * __curdump_search_near --
+ * WT_CURSOR::search_near for dump cursors.
+ */
+static int
+__curdump_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_DUMP *cdump;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ return (cdump->child->search_near(cdump->child, exact));
+}
+
+WT_CURDUMP_PASS(insert)
+WT_CURDUMP_PASS(update)
+WT_CURDUMP_PASS(remove)
+
+/*
+ * __curdump_close --
+ * WT_CURSOR::close for dump cursors.
+ */
+static int
+__curdump_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR *child;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cdump = (WT_CURSOR_DUMP *)cursor;
+ child = cdump->child;
+
+	CURSOR_API_CALL(cursor, session, close, NULL);
+ if (child != NULL)
+ WT_TRET(child->close(child));
+ /* We shared the child's URI. */
+ cursor->internal_uri = NULL;
+ __wt_json_close(session, cursor);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curdump_create --
+ *	Initialize a dump cursor.
+ */
+int
+__wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __curdump_get_key, /* get-key */
+ __curdump_get_value, /* get-value */
+ __curdump_set_key, /* set-key */
+ __curdump_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curdump_next, /* next */
+ __curdump_prev, /* prev */
+ __curdump_reset, /* reset */
+ __curdump_search, /* search */
+ __curdump_search_near, /* search-near */
+ __curdump_insert, /* insert */
+ __curdump_update, /* update */
+ __curdump_remove, /* remove */
+ __curdump_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_DUMP *cdump;
+ WT_CURSOR_JSON *json;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ const char *cfg[2];
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_DUMP, iface) == 0);
+
+ session = (WT_SESSION_IMPL *)child->session;
+
+ WT_RET(__wt_calloc_def(session, 1, &cdump));
+ cursor = &cdump->iface;
+ *cursor = iface;
+ cursor->session = child->session;
+ cursor->internal_uri = child->internal_uri;
+ cursor->key_format = child->key_format;
+ cursor->value_format = child->value_format;
+ cdump->child = child;
+
+ /* Copy the dump flags from the child cursor. */
+ F_SET(cursor, F_ISSET(child,
+ WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_JSON | WT_CURSTD_DUMP_PRINT));
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) {
+ WT_ERR(__wt_calloc_def(session, 1, &json));
+ cursor->json_private = child->json_private = json;
+ }
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = NULL;
+ WT_ERR(__wt_cursor_init(cursor, NULL, owner, cfg, cursorp));
+
+ if (0) {
+err: __wt_free(session, cursor);
+ }
+ return (ret);
+}
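
Dump cursors are created when a cursor is opened with a dump configuration; they wrap the real (child) cursor and translate keys and values through the routines above. A hedged usage sketch, assuming an existing table:mytable:

	#include <stdio.h>

	static int
	dump_table(WT_SESSION *session)
	{
		WT_CURSOR *cursor;
		const char *key, *value;
		int ret;

		/* "dump=print" selects the escaped format; "dump=hex" and
		 * "dump=json" select the other two handled above. */
		if ((ret = session->open_cursor(session,
		    "table:mytable", NULL, "dump=print", &cursor)) != 0)
			return (ret);

		while ((ret = cursor->next(cursor)) == 0) {
			if ((ret = cursor->get_key(cursor, &key)) != 0 ||
			    (ret = cursor->get_value(cursor, &value)) != 0)
				break;
			printf("%s\n%s\n", key, value);
		}
		return (cursor->close(cursor));
	}
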
diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c
new file mode 100644
index 00000000000..e5aaa19d0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_file.c
@@ -0,0 +1,471 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_BTREE_CURSOR_SAVE_AND_RESTORE
+ * Save the cursor's key/value data/size fields, call an underlying btree
+ * function, and then consistently handle failure and success.
+ */
+#define WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, f, ret) do { \
+ WT_ITEM __key_copy = (cursor)->key; \
+ uint64_t __recno = (cursor)->recno; \
+ WT_ITEM __value_copy = (cursor)->value; \
+ if (((ret) = (f)) == 0) { \
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \
+ } else { \
+ if (F_ISSET(cursor, WT_CURSTD_KEY_EXT)) { \
+ (cursor)->recno = __recno; \
+ WT_ITEM_SET((cursor)->key, __key_copy); \
+ } \
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) \
+ WT_ITEM_SET((cursor)->value, __value_copy); \
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \
+ } \
+} while (0)
+
+/*
+ * __curfile_compare --
+ * WT_CURSOR->compare method for the btree cursor type.
+ */
+static int
+__curfile_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)a;
+ CURSOR_API_CALL(a, session, compare, cbt->btree);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * call the underlying object to compare them.
+ */
+ if (strcmp(a->internal_uri, b->internal_uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Cursors must reference the same object");
+
+ WT_CURSOR_CHECKKEY(a);
+ WT_CURSOR_CHECKKEY(b);
+
+ ret = __wt_btcur_compare(
+ (WT_CURSOR_BTREE *)a, (WT_CURSOR_BTREE *)b, cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_next --
+ * WT_CURSOR->next method for the btree cursor type.
+ */
+static int
+__curfile_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, cbt->btree);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((ret = __wt_btcur_next(cbt, 0)) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_next_random --
+ * WT_CURSOR->next method for the btree cursor type when configured with
+ * next_random.
+ */
+static int
+__curfile_next_random(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, cbt->btree);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((ret = __wt_btcur_next_random(cbt)) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
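
This variant is installed when the cursor is opened with the next_random configuration, turning next into a random-record sampler; the other positioning methods aren't meaningful on such a cursor. A sketch, with an illustrative file name:

	WT_CURSOR *cursor;
	int ret;

	/* Each next() call positions the cursor on a random record. */
	ret = session->open_cursor(session,
	    "file:access.wt", NULL, "next_random=true", &cursor);
	ret = cursor->next(cursor);
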
+
+/*
+ * __curfile_prev --
+ * WT_CURSOR->prev method for the btree cursor type.
+ */
+static int
+__curfile_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, cbt->btree);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((ret = __wt_btcur_prev(cbt, 0)) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_reset --
+ * WT_CURSOR->reset method for the btree cursor type.
+ */
+static int
+__curfile_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, cbt->btree);
+
+ ret = __wt_btcur_reset(cbt);
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_search --
+ * WT_CURSOR->search method for the btree cursor type.
+ */
+static int
+__curfile_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, search, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_search(cbt), ret);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_search_near --
+ * WT_CURSOR->search_near method for the btree cursor type.
+ */
+static int
+__curfile_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+ cursor, __wt_btcur_search_near(cbt, exact), ret);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curfile_insert --
+ * WT_CURSOR->insert method for the btree cursor type.
+ */
+static int
+__curfile_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree);
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND))
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret);
+
+ /*
+ * Insert is the one cursor operation that doesn't end with the cursor
+ * pointing to an on-page item. The standard macro handles errors
+ * correctly, but we need to leave the application cursor unchanged in
+ * the case of success, except for column-store appends, where we are
+ * returning a key.
+ */
+ if (ret == 0) {
+ if (!F_ISSET(cursor, WT_CURSTD_APPEND)) {
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ F_CLR(cursor, WT_CURSTD_KEY_INT);
+ }
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ F_CLR(cursor, WT_CURSTD_VALUE_INT);
+ }
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_update --
+ * WT_CURSOR->update method for the btree cursor type.
+ */
+static int
+__curfile_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_update(cbt), ret);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __wt_curfile_update_check --
+ * WT_CURSOR->update_check method for the btree cursor type.
+ */
+int
+__wt_curfile_update_check(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(
+ cursor, __wt_btcur_update_check(cbt), ret);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_remove --
+ * WT_CURSOR->remove method for the btree cursor type.
+ */
+static int
+__curfile_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree);
+
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+
+ WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret);
+
+ /*
+ * After a successful remove, copy the key: the value is not available.
+ */
+ if (ret == 0) {
+ if (F_ISSET(cursor, WT_CURSTD_KEY_INT) &&
+ !WT_DATA_IN_ITEM(&(cursor)->key)) {
+ WT_ERR(__wt_buf_set(session, &cursor->key,
+ cursor->key.data, cursor->key.size));
+ F_CLR(cursor, WT_CURSTD_KEY_INT);
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ }
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ }
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curfile_close --
+ * WT_CURSOR->close method for the btree cursor type.
+ */
+static int
+__curfile_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cbt = (WT_CURSOR_BTREE *)cursor;
+ CURSOR_API_CALL(cursor, session, close, cbt->btree);
+ WT_TRET(__wt_btcur_close(cbt));
+ if (cbt->btree != NULL)
+ WT_TRET(__wt_session_release_btree(session));
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curfile_create --
+ * Open a cursor for a given btree handle.
+ */
+int
+__wt_curfile_create(WT_SESSION_IMPL *session,
+ WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
+ WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curfile_compare, /* compare */
+ __curfile_next, /* next */
+ __curfile_prev, /* prev */
+ __curfile_reset, /* reset */
+ __curfile_search, /* search */
+ __curfile_search_near, /* search-near */
+ __curfile_insert, /* insert */
+ __curfile_update, /* update */
+ __curfile_remove, /* remove */
+ __curfile_close); /* close */
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_BTREE *cbt;
+ WT_CURSOR_BULK *cbulk;
+ WT_DECL_RET;
+ size_t csize;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);
+
+ cbt = NULL;
+
+ btree = S2BT(session);
+ WT_ASSERT(session, btree != NULL);
+
+ csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
+ WT_RET(__wt_calloc(session, 1, csize, &cbt));
+
+ cursor = &cbt->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->internal_uri = btree->dhandle->name;
+ cursor->key_format = btree->key_format;
+ cursor->value_format = btree->value_format;
+
+ cbt->btree = btree;
+ if (bulk) {
+ F_SET(cursor, WT_CURSTD_BULK);
+
+ cbulk = (WT_CURSOR_BULK *)cbt;
+
+ /* Optionally skip the validation of each bulk-loaded key. */
+ WT_ERR(__wt_config_gets_def(
+ session, cfg, "skip_sort_check", 0, &cval));
+ WT_ERR(__wt_curbulk_init(
+ session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
+ }
+
+ /*
+ * random_retrieval
+ * Random retrieval cursors only support next, reset and close.
+ */
+ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ if (cval.val != 0) {
+ __wt_cursor_set_notsup(cursor);
+ cursor->next = __curfile_next_random;
+ cursor->reset = __curfile_reset;
+ }
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, owner, cfg, cursorp));
+
+ WT_STAT_FAST_CONN_INCR(session, cursor_create);
+ WT_STAT_FAST_DATA_INCR(session, cursor_create);
+
+ if (0) {
+err: __wt_free(session, cbt);
+ }
+
+ return (ret);
+}
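+
+/*
+ * An illustrative sketch (hypothetical URI) of the next_random configuration
+ * handled above: such a cursor supports only next, reset and close, and each
+ * next call positions the cursor on a randomly chosen record.
+ *
+ * ret = session->open_cursor(
+ *     session, "file:example.wt", NULL, "next_random=true", &c);
+ * ret = c->next(c);
+ */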
+
+/*
+ * __wt_curfile_open --
+ * WT_SESSION->open_cursor method for the btree cursor type.
+ */
+int
+__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
+ WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int bitmap, bulk;
+ uint32_t flags;
+
+ flags = 0;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
+ if (cval.type == WT_CONFIG_ITEM_BOOL ||
+ (cval.type == WT_CONFIG_ITEM_NUM &&
+ (cval.val == 0 || cval.val == 1))) {
+ bitmap = 0;
+ bulk = (cval.val != 0);
+ } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
+ bitmap = bulk = 1;
+ else
+ WT_RET_MSG(session, EINVAL,
+ "Value for 'bulk' must be a boolean or 'bitmap'");
+
+ /* Bulk handles require exclusive access. */
+ if (bulk)
+ LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);
+
+ /* Get the handle and lock it while the cursor is using it. */
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, flags));
+ else
+ WT_RET(__wt_bad_object_type(session, uri));
+
+ WT_ERR(__wt_curfile_create(session, owner, cfg, bulk, bitmap, cursorp));
+
+ return (0);
+
+err: /* If the cursor could not be opened, release the handle. */
+ WT_TRET(__wt_session_release_btree(session));
+ return (ret);
+}
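+
+/*
+ * An illustrative sketch (hypothetical URI) of the bulk configuration parsed
+ * above: "bulk" accepts a boolean or the string "bitmap" (bitmap bulk loads
+ * of fixed-length column stores), and either form opens the handle
+ * exclusively.
+ *
+ * ret = session->open_cursor(
+ *     session, "file:example.wt", NULL, "bulk=true", &c);
+ */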
diff --git a/src/third_party/wiredtiger/src/cursor/cur_index.c b/src/third_party/wiredtiger/src/cursor/cur_index.c
new file mode 100644
index 00000000000..936337047b8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_index.c
@@ -0,0 +1,447 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curindex_get_value --
+ * WT_CURSOR->get_value implementation for index cursors.
+ */
+static int
+__curindex_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ ret = __wt_schema_project_merge(session,
+ cindex->cg_cursors, cindex->value_plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ }
+ } else
+ ret = __wt_schema_project_out(session,
+ cindex->cg_cursors, cindex->value_plan, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_set_value --
+ * WT_CURSOR->set_value implementation for index cursors.
+ */
+static void
+__curindex_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+ ret = ENOTSUP;
+err: cursor->saved_err = ret;
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ API_END(session, ret);
+}
+
+/*
+ * __curindex_move --
+ * When an index cursor changes position, set the primary key in the
+ * associated column groups and update their positions to match.
+ */
+static int
+__curindex_move(WT_CURSOR_INDEX *cindex)
+{
+ WT_CURSOR **cp, *first;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = (WT_SESSION_IMPL *)cindex->iface.session;
+ first = NULL;
+
+ /* Point the public cursor to the key in the child. */
+ __wt_cursor_set_raw_key(&cindex->iface, &cindex->child->key);
+ F_CLR(&cindex->iface, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table);
+ i++, cp++) {
+ if (*cp == NULL)
+ continue;
+ if (first == NULL) {
+ /*
+ * Set the primary key -- note that we need the primary
+ * key columns, so we have to use the full key format,
+ * not just the public columns.
+ */
+ WT_RET(__wt_schema_project_slice(session,
+ cp, cindex->index->key_plan,
+ 1, cindex->index->key_format,
+ &cindex->iface.key));
+ first = *cp;
+ } else {
+ (*cp)->key.data = first->key.data;
+ (*cp)->key.size = first->key.size;
+ (*cp)->recno = first->recno;
+ }
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_RET((*cp)->search(*cp));
+ }
+
+ F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ return (0);
+}
+
+/*
+ * __curindex_next --
+ * WT_CURSOR->next method for index cursors.
+ */
+static int
+__curindex_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ if ((ret = cindex->child->next(cindex->child)) == 0)
+ ret = __curindex_move(cindex);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_prev --
+ * WT_CURSOR->prev method for index cursors.
+ */
+static int
+__curindex_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ if ((ret = cindex->child->prev(cindex->child)) == 0)
+ ret = __curindex_move(cindex);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_reset --
+ * WT_CURSOR->reset method for index cursors.
+ */
+static int
+__curindex_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR **cp;
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ WT_TRET(cindex->child->reset(cindex->child));
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table);
+ i++, cp++) {
+ if (*cp == NULL)
+ continue;
+ WT_TRET((*cp)->reset(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_search --
+ * WT_CURSOR->search method for index cursors.
+ */
+static int
+__curindex_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *child;
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int exact;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ child = cindex->child;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ /*
+ * We expect partial matches, but we want the smallest item that
+ * matches the prefix. Fail if there is no matching item.
+ */
+ __wt_cursor_set_raw_key(child, &cursor->key);
+ WT_ERR(child->search_near(child, &exact));
+
+ /*
+ * We expect partial matches, and want the smallest record with a key
+ * greater than or equal to the search key. The only way for the key
+ * to be equal is if there is an index on the primary key, because
+ * otherwise the primary key columns will be appended to the index key,
+ * but we don't disallow that (odd) case.
+ */
+ if (exact < 0)
+ WT_ERR(child->next(child));
+
+ if (child->key.size < cursor->key.size ||
+ memcmp(child->key.data, cursor->key.data, cursor->key.size) != 0) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+
+ WT_ERR(__curindex_move(cindex));
+
+ if (0) {
+err: F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ }
+
+ API_END_RET(session, ret);
+}
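+
+/*
+ * An illustrative sketch (hypothetical names) of the matching described
+ * above: because the primary key columns are appended to the index key,
+ * search positions the cursor on the smallest index entry whose key begins
+ * with the application's key.
+ *
+ * ret = session->open_cursor(
+ *     session, "index:people:byname", NULL, NULL, &c);
+ * c->set_key(c, "smith");
+ * ret = c->search(c);
+ */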
+
+/*
+ * __curindex_search_near --
+ * WT_CURSOR->search_near method for index cursors.
+ */
+static int
+__curindex_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ __wt_cursor_set_raw_key(cindex->child, &cursor->key);
+ if ((ret = cindex->child->search_near(cindex->child, exact)) == 0)
+ ret = __curindex_move(cindex);
+ else
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_close --
+ * WT_CURSOR->close method for index cursors.
+ */
+static int
+__curindex_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_INDEX *cindex;
+ WT_CURSOR **cp;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ cindex = (WT_CURSOR_INDEX *)cursor;
+ idx = cindex->index;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ if ((cp = cindex->cg_cursors) != NULL)
+ for (i = 0, cp = cindex->cg_cursors;
+ i < WT_COLGROUPS(cindex->table); i++, cp++)
+ if (*cp != NULL) {
+ WT_TRET((*cp)->close(*cp));
+ *cp = NULL;
+ }
+
+ __wt_free(session, cindex->cg_cursors);
+ if (cindex->key_plan != idx->key_plan)
+ __wt_free(session, cindex->key_plan);
+ if (cursor->value_format != cindex->table->value_format)
+ __wt_free(session, cursor->value_format);
+ if (cindex->value_plan != idx->value_plan)
+ __wt_free(session, cindex->value_plan);
+
+ if (cindex->child != NULL)
+ WT_TRET(cindex->child->close(cindex->child));
+
+ __wt_schema_release_table(session, cindex->table);
+ /* The URI is owned by the index. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curindex_open_colgroups --
+ * Open cursors on the column groups required for an index cursor.
+ */
+static int
+__curindex_open_colgroups(
+ WT_SESSION_IMPL *session, WT_CURSOR_INDEX *cindex, const char *cfg_arg[])
+{
+ WT_TABLE *table;
+ WT_CURSOR **cp;
+ u_long arg;
+ /* Child cursors are opened with dump disabled. */
+ const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL };
+ char *proj;
+
+ table = cindex->table;
+ WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp));
+ cindex->cg_cursors = cp;
+
+ /* Work out which column groups we need. */
+ for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) ||
+ cp[arg] != NULL)
+ continue;
+ WT_RET(__wt_open_cursor(session,
+ table->cgroups[arg]->source,
+ &cindex->iface, cfg, &cp[arg]));
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_curindex_open --
+ * WT_SESSION->open_cursor method for index cursors.
+ */
+int
+__wt_curindex_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __curindex_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __curindex_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curindex_next, /* next */
+ __curindex_prev, /* prev */
+ __curindex_reset, /* reset */
+ __curindex_search, /* search */
+ __curindex_search_near, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curindex_close); /* close */
+ WT_CURSOR_INDEX *cindex;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *columns, *idxname, *tablename;
+ size_t namesize;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "index:") ||
+ (idxname = strchr(tablename, ':')) == NULL)
+ WT_RET_MSG(session, EINVAL, "Invalid cursor URI: '%s'", uri);
+ namesize = (size_t)(idxname - tablename);
+ ++idxname;
+
+ if ((ret = __wt_schema_get_table(session,
+ tablename, namesize, 0, &table)) != 0) {
+ if (ret == WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "Cannot open cursor '%s' on unknown table", uri);
+ return (ret);
+ }
+
+ columns = strchr(idxname, '(');
+ if (columns == NULL)
+ namesize = strlen(idxname);
+ else
+ namesize = (size_t)(columns - idxname);
+
+ WT_RET(__wt_schema_open_index(session, table, idxname, namesize, &idx));
+ WT_RET(__wt_calloc_def(session, 1, &cindex));
+
+ cursor = &cindex->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+
+ cindex->table = table;
+ cindex->index = idx;
+ cindex->key_plan = idx->key_plan;
+ cindex->value_plan = idx->value_plan;
+
+ cursor->internal_uri = idx->name;
+ cursor->key_format = idx->idxkey_format;
+ cursor->value_format = table->value_format;
+
+ /*
+ * XXX
+ * A very odd corner case is an index with a recno key.
+ * The only way to get here is by creating an index on a column store
+ * using only the primary's recno as the index key. Disallow that for
+ * now.
+ */
+ if (WT_CURSOR_RECNO(cursor))
+ WT_ERR_MSG(session, WT_ERROR,
+ "Column store indexes based on a record number primary "
+ "key are not supported.");
+
+ /* Handle projections. */
+ if (columns != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_struct_reformat(session, table,
+ columns, strlen(columns), NULL, 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cursor->value_format));
+
+ WT_ERR(__wt_buf_init(session, tmp, 0));
+ WT_ERR(__wt_struct_plan(session, table,
+ columns, strlen(columns), 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cindex->value_plan));
+ }
+
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, owner, cfg, cursorp));
+
+ WT_ERR(__wt_open_cursor(
+ session, idx->source, cursor, cfg, &cindex->child));
+
+ /* Open the column groups needed for this index cursor. */
+ WT_ERR(__curindex_open_colgroups(session, cindex, cfg));
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_column_init(cursor, table->key_format,
+ &idx->colconf, &table->colconf));
+
+ if (0) {
+err: WT_TRET(__curindex_close(cursor));
+ *cursorp = NULL;
+ }
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_json.c b/src/third_party/wiredtiger/src/cursor/cur_json.c
new file mode 100644
index 00000000000..f4459819259
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_json.c
@@ -0,0 +1,931 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static size_t __json_unpack_put(WT_SESSION_IMPL *, void *, u_char *, size_t,
+ WT_CONFIG_ITEM *);
+static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t,
+ const char *, WT_CONFIG_ITEM *, int, size_t *);
+static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t,
+ const char *, WT_CONFIG_ITEM *, u_char *, size_t, int, va_list);
+static int json_string_arg(WT_SESSION_IMPL *, const char **, WT_ITEM *);
+static int json_int_arg(WT_SESSION_IMPL *, const char **, int64_t *);
+static int json_uint_arg(WT_SESSION_IMPL *, const char **, uint64_t *);
+static int __json_pack_struct(WT_SESSION_IMPL *, void *, size_t, const char *,
+ const char *);
+static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *,
+ int, const char *, size_t *);
+
+#define WT_PACK_JSON_GET(session, pv, jstr) do { \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \
+ pv.type = pv.type == 's' ? 'j' : 'J'; \
+ break; \
+ case 'b': \
+ case 'h': \
+ case 'i': \
+ case 'l': \
+ case 'q': \
+ WT_RET(json_int_arg(session, &jstr, &pv.u.i)); \
+ break; \
+ case 'B': \
+ case 'H': \
+ case 'I': \
+ case 'L': \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ case 't': \
+ WT_RET(json_uint_arg(session, &jstr, &pv.u.u)); \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __json_unpack_put --
+ * Calculate the size of a packed byte string as formatted for JSON.
+ */
+static size_t
+__json_unpack_put(WT_SESSION_IMPL *session, void *voidpv,
+ u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name)
+{
+ WT_PACK_VALUE *pv;
+ const char *p, *end;
+ size_t s, n;
+
+ pv = (WT_PACK_VALUE *)voidpv;
+ s = (size_t)snprintf((char *)buf, bufsz, "\"%.*s\" : ",
+ (int)name->len, name->str);
+ if (s <= bufsz) {
+ bufsz -= s;
+ buf += s;
+ } else
+ bufsz = 0;
+
+ switch (pv->type) {
+ case 'x':
+ return (0);
+ case 's':
+ case 'S':
+ /* Account for '"' quote in front and back. */
+ s += 2;
+ p = (const char *)pv->u.s;
+ if (bufsz > 0) {
+ *buf++ = '"';
+ bufsz--;
+ }
+ if (pv->type == 's' || pv->havesize) {
+ end = p + pv->size;
+ for (; p < end; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 0);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ } else
+ for (; *p; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 0);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ if (bufsz > 0)
+ *buf++ = '"';
+ return (s);
+ case 'U':
+ case 'u':
+ s += 2;
+ p = (const char *)pv->u.item.data;
+ end = p + pv->u.item.size;
+ if (bufsz > 0) {
+ *buf++ = '"';
+ bufsz--;
+ }
+ for (; p < end; p++) {
+ n = __wt_json_unpack_char(*p, buf, bufsz, 1);
+ if (n > bufsz)
+ bufsz = 0;
+ else {
+ bufsz -= n;
+ buf += n;
+ }
+ s += n;
+ }
+ if (bufsz > 0)
+ *buf++ = '"';
+ return (s);
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ return (s +
+ (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.i));
+ case 'B':
+ case 't':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ case 'R':
+ return (s +
+ (size_t)snprintf((char *)buf, bufsz, "%" PRIu64, pv->u.u));
+ }
+ __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type);
+ return ((size_t)-1);
+}
+
+/*
+ * __json_struct_size --
+ * Calculate the size of a packed byte string as formatted for JSON.
+ */
+static inline int
+__json_struct_size(WT_SESSION_IMPL *session, const void *buffer,
+ size_t size, const char *fmt, WT_CONFIG_ITEM *names, int iskey,
+ size_t *presult)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ const uint8_t *p, *end;
+ size_t result;
+ int needcr;
+
+ p = buffer;
+ end = p + size;
+ result = 0;
+ needcr = 0;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (needcr)
+ result += 2;
+ needcr = 1;
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_RET(__pack_name_next(&packname, &name));
+ result += __json_unpack_put(session, &pv, NULL, 0, &name);
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ /* Be paranoid - __unpack_read should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ *presult = result;
+ return (ret);
+}
+
+/*
+ * __json_struct_unpackv --
+ * Unpack a byte string to JSON (va_list version).
+ */
+static inline int
+__json_struct_unpackv(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, WT_CONFIG_ITEM *names,
+ u_char *jbuf, size_t jbufsize, int iskey, va_list ap)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ int needcr;
+ size_t jsize;
+ const uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+ needcr = 0;
+
+ /* Unpacking a cursor marked as json implies a single arg. */
+ *va_arg(ap, const char **) = (char *)jbuf;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (needcr) {
+ WT_ASSERT(session, jbufsize >= 3);
+ strncat((char *)jbuf, ",\n", jbufsize);
+ jbuf += 2;
+ jbufsize -= 2;
+ }
+ needcr = 1;
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_RET(__pack_name_next(&packname, &name));
+ jsize = __json_unpack_put(session,
+ (u_char *)&pv, jbuf, jbufsize, &name);
+ WT_ASSERT(session, jsize <= jbufsize);
+ jbuf += jsize;
+ jbufsize -= jsize;
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ /* Be paranoid - __unpack_read should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ WT_ASSERT(session, jbufsize == 1);
+
+ return (ret);
+}
+
+/*
+ * __wt_json_alloc_unpack --
+ * Allocate space for, and unpack an entry into JSON format.
+ */
+int
+__wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer,
+ size_t size, const char *fmt, WT_CURSOR_JSON *json,
+ int iskey, va_list ap)
+{
+ WT_CONFIG_ITEM *names;
+ WT_DECL_RET;
+ size_t needed;
+ char **json_bufp;
+
+ if (iskey) {
+ names = &json->key_names;
+ json_bufp = &json->key_buf;
+ } else {
+ names = &json->value_names;
+ json_bufp = &json->value_buf;
+ }
+ needed = 0;
+ WT_RET(__json_struct_size(session, buffer, size, fmt, names,
+ iskey, &needed));
+ WT_RET(__wt_realloc(session, NULL, needed + 1, json_bufp));
+ WT_RET(__json_struct_unpackv(session, buffer, size, fmt,
+ names, (u_char *)*json_bufp, needed + 1, iskey, ap));
+
+ return (ret);
+}
+
+/*
+ * __wt_json_close --
+ * Release any json related resources.
+ */
+void
+__wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+ WT_CURSOR_JSON *json;
+
+ if ((json = (WT_CURSOR_JSON *)cursor->json_private) != NULL) {
+ __wt_free(session, json->key_buf);
+ __wt_free(session, json->value_buf);
+ __wt_free(session, json);
+ }
+ return;
+}
+
+/*
+ * __wt_json_unpack_char --
+ * Unpack a single character into JSON escaped format.
+ * Can be called with null buf for sizing.
+ */
+size_t
+__wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode)
+{
+ char abbrev;
+ u_char h;
+
+ if (!force_unicode) {
+ if (isprint(ch) && ch != '\\' && ch != '"') {
+ if (bufsz >= 1)
+ *buf = (u_char)ch;
+ return (1);
+ } else {
+ abbrev = '\0';
+ switch (ch) {
+ case '\\':
+ case '"':
+ abbrev = ch;
+ break;
+ case '\f':
+ abbrev = 'f';
+ break;
+ case '\n':
+ abbrev = 'n';
+ break;
+ case '\r':
+ abbrev = 'r';
+ break;
+ case '\t':
+ abbrev = 't';
+ break;
+ }
+ if (abbrev != '\0') {
+ if (bufsz >= 2) {
+ *buf++ = '\\';
+ *buf = (u_char)abbrev;
+ }
+ return (2);
+ }
+ }
+ }
+ if (bufsz >= 6) {
+ *buf++ = '\\';
+ *buf++ = 'u';
+ *buf++ = '0';
+ *buf++ = '0';
+ h = (((u_char)ch) >> 4) & 0xF;
+ if (h >= 10)
+ *buf++ = 'A' + (h - 10);
+ else
+ *buf++ = '0' + h;
+ h = ((u_char)ch) & 0xF;
+ if (h >= 10)
+ *buf++ = 'A' + (h - 10);
+ else
+ *buf++ = '0' + h;
+ }
+ return (6);
+}
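+
+/*
+ * Illustrative sizing examples: a newline needs 2 output bytes (a backslash
+ * and 'n'), while an unprintable byte such as 0x01 needs 6 (the \u0001
+ * form), so a caller can size with an empty buffer and then fill:
+ *
+ * need = __wt_json_unpack_char('\n', NULL, 0, 0);
+ * (allocate at least need bytes, then)
+ * (void)__wt_json_unpack_char('\n', buf, need, 0);
+ */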
+
+/*
+ * __wt_json_column_init --
+ * Set json->key_names and json->value_names to comma-separated lists
+ * of column names.
+ */
+int
+__wt_json_column_init(WT_CURSOR *cursor, const char *keyformat,
+ const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf)
+{
+ WT_CURSOR_JSON *json;
+ const char *p, *end, *beginkey;
+ uint32_t keycnt, nkeys;
+
+ json = (WT_CURSOR_JSON *)cursor->json_private;
+ beginkey = colconf->str;
+ end = beginkey + colconf->len;
+
+ if (idxconf != NULL) {
+ json->key_names.str = idxconf->str;
+ json->key_names.len = idxconf->len;
+ } else if (colconf->len > 0 && *beginkey == '(') {
+ beginkey++;
+ if (end[-1] == ')')
+ end--;
+ }
+
+ for (nkeys = 0; *keyformat; keyformat++)
+ if (!isdigit(*keyformat))
+ nkeys++;
+
+ p = beginkey;
+ keycnt = 0;
+ while (p < end && keycnt < nkeys) {
+ if (*p == ',')
+ keycnt++;
+ p++;
+ }
+ json->value_names.str = p;
+ json->value_names.len = WT_PTRDIFF(end, p);
+ if (idxconf == NULL) {
+ if (p > beginkey)
+ p--;
+ json->key_names.str = beginkey;
+ json->key_names.len = WT_PTRDIFF(p, beginkey);
+ }
+ return (0);
+}
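+
+/*
+ * An illustrative example: for a hypothetical table created with
+ * columns=(id,dept,salary) and key_format "i", nkeys is 1, so key_names
+ * becomes "id" and value_names becomes "dept,salary".
+ */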
+
+#define MATCH_KEYWORD(session, in, result, keyword, matchval) do { \
+ size_t _kwlen = strlen(keyword); \
+ if (strncmp(in, keyword, _kwlen) == 0 && !isalnum(in[_kwlen])) { \
+ in += _kwlen; \
+ result = matchval; \
+ } else { \
+ const char *_bad = in; \
+ while (isalnum(*in)) \
+ in++; \
+ __wt_errx(session, "unknown keyword \"%.*s\" in JSON", \
+ (int)(in - _bad), _bad); \
+ } \
+} while (0)
+
+/*
+ * __wt_json_token --
+ * Return the type, start position and length of the next JSON
+ * token in the input. String tokens include the quotes. JSON
+ * can be entirely parsed using calls to this tokenizer, each
+ * call using a src pointer that is the previously returned
+ * tokstart + toklen.
+ *
+ * The token type returned is one of:
+ * 0 : EOF
+ * 's' : string
+ * 'i' : intnum
+ * 'f' : floatnum
+ * ':' : colon
+ * ',' : comma
+ * '{' : lbrace
+ * '}' : rbrace
+ * '[' : lbracket
+ * ']' : rbracket
+ * 'N' : null
+ * 'T' : true
+ * 'F' : false
+ */
+int
+__wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype,
+ const char **tokstart, size_t *toklen)
+{
+ WT_SESSION_IMPL *session;
+ char ch;
+ const char *bad;
+ int backslash, isalph, isfloat, result;
+
+ result = -1;
+ session = (WT_SESSION_IMPL *)wt_session;
+ while (isspace(*src))
+ src++;
+ *tokstart = src;
+
+ if (*src == '\0') {
+ *toktype = 0;
+ *toklen = 0;
+ return (0);
+ }
+
+ /* JSON is specified in RFC 4627. */
+ switch (*src) {
+ case '"':
+ backslash = 0;
+ src++;
+ while ((ch = *src) != '\0') {
+ if (!backslash) {
+ if (ch == '"') {
+ src++;
+ result = 's';
+ break;
+ }
+ if (ch == '\\')
+ backslash = 1;
+ } else {
+ /* We validate Unicode on this pass. */
+ if (ch == 'u') {
+ u_char ignored;
+ const u_char *uc;
+
+ uc = (const u_char *)src;
+ if (__wt_hex2byte(&uc[1], &ignored) ||
+ __wt_hex2byte(&uc[3], &ignored)) {
+ __wt_errx(session,
+ "invalid Unicode within JSON string");
+ return (-1);
+ }
+ /* Leave src on the last hex digit; the loop's src++ skips it. */
+ src += 4;
+ }
+ backslash = 0;
+ }
+ src++;
+ }
+ if (result != 's')
+ __wt_errx(session, "unterminated string in JSON");
+ break;
+ case '-':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ isfloat = 0;
+ if (*src == '-')
+ src++;
+ while ((ch = *src) != '\0' && isdigit(ch))
+ src++;
+ if (*src == '.') {
+ isfloat = 1;
+ src++;
+ while ((ch = *src) != '\0' &&
+ isdigit(ch))
+ src++;
+ }
+ if (*src == 'e' || *src == 'E') {
+ isfloat = 1;
+ src++;
+ if (*src == '+' || *src == '-')
+ src++;
+ while ((ch = *src) != '\0' &&
+ isdigit(ch))
+ src++;
+ }
+ result = isfloat ? 'f' : 'i';
+ break;
+ case ':':
+ case ',':
+ case '{':
+ case '}':
+ case '[':
+ case ']':
+ result = *src++;
+ break;
+ case 'n':
+ MATCH_KEYWORD(session, src, result, "null", 'N');
+ break;
+ case 't':
+ MATCH_KEYWORD(session, src, result, "true", 'T');
+ break;
+ case 'f':
+ MATCH_KEYWORD(session, src, result, "false", 'F');
+ break;
+ default:
+ /* An illegal token, move past it anyway */
+ bad = src;
+ isalph = isalnum(*src);
+ src++;
+ if (isalph)
+ while (*src != '\0' && isalnum(*src))
+ src++;
+ __wt_errx(session, "unknown token \"%.*s\" in JSON",
+ (int)(src - bad), bad);
+ break;
+ }
+ *toklen = (size_t)(src - *tokstart);
+ *toktype = result;
+ return (result < 0 ? EINVAL : 0);
+}
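+
+/*
+ * An illustrative sketch of the tokenizing loop described above, feeding
+ * tokstart + toklen back in as the next src:
+ *
+ * for (src = json; ; src = tokstart + toklen) {
+ *     ret = __wt_json_token(
+ *         wt_session, src, &toktype, &tokstart, &toklen);
+ *     if (ret != 0 || toktype == 0)
+ *         break;
+ * }
+ */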
+
+/*
+ * __wt_json_tokname --
+ * Return a descriptive name from the token type returned by
+ * __wt_json_token.
+ */
+const char *
+__wt_json_tokname(int toktype)
+{
+ switch (toktype) {
+ case 0: return ("<EOF>");
+ case 's': return ("<string>");
+ case 'i': return ("<integer>");
+ case 'f': return ("<float>");
+ case ':': return ("':'");
+ case ',': return ("','");
+ case '{': return ("'{'");
+ case '}': return ("'}'");
+ case '[': return ("'['");
+ case ']': return ("']'");
+ case 'N': return ("'null'");
+ case 'T': return ("'true'");
+ case 'F': return ("'false'");
+ default: return ("<UNKNOWN>");
+ }
+}
+
+/*
+ * json_string_arg --
+ * Returns a first cut of the needed string in item.
+ * The result has not been stripped of escapes.
+ */
+static int
+json_string_arg(WT_SESSION_IMPL *session, const char **jstr, WT_ITEM *item)
+{
+ const char *tokstart;
+ int tok;
+ WT_DECL_RET;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &item->size));
+ if (tok == 's') {
+ *jstr = tokstart + item->size;
+ /* The tokenizer includes the '"' chars */
+ item->data = tokstart + 1;
+ item->size -= 2;
+ ret = 0;
+ } else {
+ __wt_errx(session, "expected JSON <string>, got %s",
+ __wt_json_tokname(tok));
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * json_int_arg --
+ * Returns a signed integral value from the current position
+ * in the JSON string.
+ */
+static int
+json_int_arg(WT_SESSION_IMPL *session, const char **jstr, int64_t *ip)
+{
+ char *end;
+ const char *tokstart;
+ int tok;
+ size_t toksize;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &toksize));
+ if (tok == 'i') {
+ /* JSON only allows decimal */
+ *ip = strtoll(tokstart, &end, 10);
+ if (end != tokstart + toksize)
+ WT_RET_MSG(session, EINVAL,
+ "JSON <int> extraneous input");
+ *jstr = tokstart + toksize;
+ } else {
+ __wt_errx(session, "expected JSON <int>, got %s",
+ __wt_json_tokname(tok));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * json_uint_arg --
+ * Returns an unsigned integral value from the current position
+ * in the JSON string.
+ */
+static int
+json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up)
+{
+ char *end;
+ const char *tokstart;
+ int tok;
+ size_t toksize;
+
+ WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart,
+ &toksize));
+ if (tok == 'i' && *tokstart != '-') {
+ /* JSON only allows decimal */
+ *up = strtoull(tokstart, &end, 10);
+ if (end != tokstart + toksize)
+ WT_RET_MSG(session, EINVAL,
+ "JSON <int> extraneous input");
+ *jstr = tokstart + toksize;
+ } else {
+ __wt_errx(session, "expected unsigned JSON <int>, got %s",
+ __wt_json_tokname(tok));
+ return (EINVAL);
+ }
+ return (0);
+}
+
+#define JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do { \
+ int __tok; \
+ WT_RET(__wt_json_token((WT_SESSION *)session, jstr, &__tok, &start, &sz));\
+ if (__tok != tokval) { \
+ __wt_errx(session, "expected JSON %s, got %s", \
+ __wt_json_tokname(tokval), __wt_json_tokname(__tok)); \
+ return (EINVAL); \
+ } \
+ jstr = start + sz; \
+} while (0)
+
+#define JSON_EXPECT_TOKEN(session, jstr, tokval) do { \
+ const char *__start; \
+ size_t __sz; \
+ JSON_EXPECT_TOKEN_GET(session, jstr, tokval, __start, __sz); \
+} while (0)
+
+/*
+ * __json_pack_struct --
+ * Pack a byte string from a JSON string.
+ */
+static int
+__json_pack_struct(WT_SESSION_IMPL *session, void *buffer, size_t size,
+ const char *fmt, const char *jstr)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ const char *tokstart;
+ int multi;
+ size_t toksize;
+ uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+ multi = 0;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ /* the key name was verified in __json_pack_size */
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ pv.type = fmt[0];
+ WT_PACK_JSON_GET(session, pv, jstr);
+ return (__pack_write(session, &pv, &p, size));
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ if (multi)
+ JSON_EXPECT_TOKEN(session, jstr, ',');
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ /* the key name was verified in __json_pack_size */
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ WT_PACK_JSON_GET(session, pv, jstr);
+ WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
+ multi = 1;
+ }
+
+ /* Be paranoid - __pack_write should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __json_pack_size --
+ * Calculate the size of a packed byte string from a JSON string.
+ * We verify that the names and value types provided in JSON match
+ * the column names and type from the schema format, returning error
+ * if not.
+ */
+static int
+__json_pack_size(
+ WT_SESSION_IMPL *session, const char *fmt, WT_CONFIG_ITEM *names,
+ int iskey, const char *jstr, size_t *sizep)
+{
+ WT_CONFIG_ITEM name;
+ WT_DECL_PACK_VALUE(pv);
+ WT_PACK pack;
+ WT_PACK_NAME packname;
+ const char *tokstart;
+ int multi;
+ size_t toksize, total;
+
+ WT_RET(__pack_name_init(session, names, iskey, &packname));
+ multi = 0;
+ WT_RET(__pack_init(session, &pack, fmt));
+ for (total = 0; __pack_next(&pack, &pv) == 0;) {
+ if (multi)
+ JSON_EXPECT_TOKEN(session, jstr, ',');
+ JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize);
+ WT_RET(__pack_name_next(&packname, &name));
+ if (toksize - 2 != name.len ||
+ strncmp(tokstart + 1, name.str, toksize - 2) != 0) {
+ __wt_errx(session, "JSON expected %s name: \"%.*s\"",
+ iskey ? "key" : "value", (int)name.len, name.str);
+ return (EINVAL);
+ }
+ JSON_EXPECT_TOKEN(session, jstr, ':');
+ WT_PACK_JSON_GET(session, pv, jstr);
+ total += __pack_size(session, &pv);
+ multi = 1;
+ }
+ /* check end of string */
+ JSON_EXPECT_TOKEN(session, jstr, 0);
+
+ *sizep = total;
+ return (0);
+}
+
+/*
+ * __wt_json_to_item --
+ * Convert a JSON input string for either key/value to a raw WT_ITEM.
+ * Checks that the input matches the expected format.
+ */
+int
+__wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr,
+ const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item)
+{
+ size_t sz;
+ sz = 0; /* Initialize because GCC 4.1 is paranoid */
+
+ WT_RET(__json_pack_size(session, format,
+ iskey ? &json->key_names : &json->value_names, iskey, jstr, &sz));
+ WT_RET(__wt_buf_initsize(session, item, sz));
+ WT_RET(__json_pack_struct(session, item->mem, sz, format, jstr));
+ return (0);
+}
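+
+/*
+ * An illustrative example: for a hypothetical value format "iS" with value
+ * names "id,name", the JSON input
+ *
+ * "id" : 5,
+ * "name" : "abc"
+ *
+ * is sized and packed into the same raw bytes a set_value(cursor, 5, "abc")
+ * call would produce.
+ */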
+
+/*
+ * __wt_json_strlen --
+ * Return the number of bytes represented by a string in JSON format,
+ * or -1 if the format is incorrect.
+ */
+ssize_t
+__wt_json_strlen(const char *src, size_t srclen)
+{
+ const char *srcend;
+ size_t dstlen;
+ u_char hi, lo;
+
+ dstlen = 0;
+ srcend = src + srclen;
+ while (src < srcend) {
+ /* JSON can include any UTF-8 expressed in 4 hex chars. */
+ if (*src == '\\') {
+ if (*++src == 'u') {
+ if (__wt_hex2byte((const u_char *)++src, &hi))
+ return (-1);
+ src += 2;
+ if (__wt_hex2byte((const u_char *)src, &lo))
+ return (-1);
+ /* Leave src on the last hex digit; the loop's src++ skips it. */
+ src += 1;
+ /* RFC 3629 */
+ if (hi >= 0x8) {
+ /* 3 bytes total */
+ dstlen += 2;
+ } else if (hi != 0 || lo >= 0x80) {
+ /* 2 bytes total */
+ dstlen++;
+ }
+ /* else 1 byte total */
+ }
+ }
+ dstlen++;
+ src++;
+ }
+ if (src != srcend)
+ return (-1); /* invalid input, e.g. final char is '\\' */
+ return ((ssize_t)dstlen);
+}
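+
+/*
+ * Illustrative examples of the RFC 3629 ranges tested above: the JSON
+ * sequence \u0041 counts as 1 output byte ('A'), \u00e9 as 2 bytes (UTF-8
+ * 0xc3 0xa9) and \u4e2d as 3 bytes.
+ */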
+
+/*
+ * __wt_json_strncpy --
+ * Copy bytes of string in JSON format to a destination,
+ * up to dstlen bytes. If dstlen is greater than the needed size,
+ * the result is zero-padded.
+ */
+int
+__wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen)
+{
+ char *dst;
+ const char *dstend, *srcend;
+ u_char hi, lo;
+
+ dst = *pdst;
+ dstend = dst + dstlen;
+ srcend = src + srclen;
+ while (src < srcend && dst < dstend) {
+ /* JSON can include any UTF-8 expressed in 4 hex chars. */
+ if (*src == '\\') {
+ if (*++src == 'u') {
+ if (__wt_hex2byte((const u_char *)++src, &hi))
+ return (EINVAL);
+ src += 2;
+ if (__wt_hex2byte((const u_char *)src, &lo))
+ return (EINVAL);
+ /* Leave src on the last hex digit; the loop's src++ skips it. */
+ src += 1;
+ /* RFC 3629 */
+ if (hi >= 0x8) {
+ /* 3 bytes total */
+ /* byte 0: 1110HHHH */
+ /* byte 1: 10HHHHLL */
+ /* byte 2: 10LLLLLL */
+ *dst++ = (char)(0xe0 |
+ ((hi >> 4) & 0x0f));
+ *dst++ = (char)(0x80 |
+ ((hi << 2) & 0x3c) |
+ ((lo >> 6) & 0x03));
+ *dst++ = (char)(0x80 | (lo & 0x3f));
+ } else if (hi != 0 || lo >= 0x80) {
+ /* 2 bytes total */
+ /* byte 0: 110HHHLL */
+ /* byte 1: 10LLLLLL */
+ *dst++ = (char)(0xc0 |
+ (hi << 2) |
+ ((lo >> 6) & 0x03));
+ *dst++ = (char)(0x80 | (lo & 0x3f));
+ } else
+ /* else 1 byte total */
+ /* byte 0: 0LLLLLLL */
+ *dst++ = (char)lo;
+ } else
+ *dst++ = *src;
+ } else
+ *dst++ = *src;
+ src++;
+ }
+ if (src != srcend)
+ return (ENOMEM);
+ *pdst = dst;
+ while (dst < dstend)
+ *dst++ = '\0';
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_log.c b/src/third_party/wiredtiger/src/cursor/cur_log.c
new file mode 100644
index 00000000000..803d68e890c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_log.c
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __curlog_logrec --
+ * Callback function from log_scan to get a log record.
+ */
+static int
+__curlog_logrec(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ WT_CURSOR_LOG *cl;
+
+ cl = cookie;
+
+ /* Set up the LSNs and take a copy of the log record for the cursor. */
+ *cl->cur_lsn = *lsnp;
+ *cl->next_lsn = *lsnp;
+ cl->next_lsn->offset += (wt_off_t)logrec->size;
+ WT_RET(__wt_buf_set(session, cl->logrec, logrec->data, logrec->size));
+
+ /*
+ * Read the log header. Set up the step pointers to walk the
+ * operations inside the record. Get the record type.
+ */
+ cl->stepp = LOG_SKIP_HEADER(cl->logrec->data);
+ cl->stepp_end = (uint8_t *)cl->logrec->data + logrec->size;
+ WT_RET(__wt_logrec_read(session, &cl->stepp, cl->stepp_end,
+ &cl->rectype));
+
+ /* A step count of 0 means the entire record. */
+ cl->step_count = 0;
+
+ /*
+ * Unpack the txnid so that we can return each
+ * individual operation for this txnid.
+ */
+ if (cl->rectype == WT_LOGREC_COMMIT)
+ WT_RET(__wt_vunpack_uint(&cl->stepp,
+ WT_PTRDIFF(cl->stepp_end, cl->stepp), &cl->txnid));
+ else {
+ /*
+ * Step over anything else.
+ * Setting stepp to NULL causes the next()
+ * method to read a new record on the next call.
+ */
+ cl->stepp = NULL;
+ cl->txnid = 0;
+ }
+ return (0);
+}
+
+/*
+ * __curlog_compare --
+ * WT_CURSOR.compare method for the log cursor type.
+ */
+static int
+__curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_LOG *acl, *bcl;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ acl = (WT_CURSOR_LOG *)a;
+ bcl = (WT_CURSOR_LOG *)b;
+ WT_ASSERT(session, cmpp != NULL);
+ *cmpp = LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
+ /*
+ * If both are on the same LSN, compare step counter.
+ */
+ if (*cmpp == 0)
+ *cmpp = (acl->step_count != bcl->step_count ?
+ (acl->step_count < bcl->step_count ? -1 : 1) : 0);
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curlog_op_read --
+ * Read out any key/value from an individual operation record
+ * in the log. We're only interested in put and remove operations
+ * since truncate is not a cursor operation. All successful
+ * returns from this function will have set up the cursor copy of
+ * key and value to give the user.
+ */
+static int
+__curlog_op_read(WT_SESSION_IMPL *session,
+ WT_CURSOR_LOG *cl, uint32_t optype, uint32_t opsize, uint32_t *fileid)
+{
+ WT_ITEM key, value;
+ uint64_t recno;
+ const uint8_t *end, *pp;
+
+ pp = cl->stepp;
+ end = pp + opsize;
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_RET(__wt_logop_col_put_unpack(session, &pp, end,
+ fileid, &recno, &value));
+ WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
+ WT_RET(__wt_buf_set(session,
+ cl->opvalue, value.data, value.size));
+ break;
+ case WT_LOGOP_COL_REMOVE:
+ WT_RET(__wt_logop_col_remove_unpack(session, &pp, end,
+ fileid, &recno));
+ WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno)));
+ WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
+ break;
+ case WT_LOGOP_ROW_PUT:
+ WT_RET(__wt_logop_row_put_unpack(session, &pp, end,
+ fileid, &key, &value));
+ WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
+ WT_RET(__wt_buf_set(session,
+ cl->opvalue, value.data, value.size));
+ break;
+ case WT_LOGOP_ROW_REMOVE:
+ WT_RET(__wt_logop_row_remove_unpack(session, &pp, end,
+ fileid, &key));
+ WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size));
+ WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0));
+ break;
+ default:
+ /*
+ * Any other operations return the record in the value
+ * and an empty key.
+ */
+ *fileid = 0;
+ WT_RET(__wt_buf_set(session, cl->opkey, NULL, 0));
+ WT_RET(__wt_buf_set(session, cl->opvalue, cl->stepp, opsize));
+ }
+ return (0);
+}
+
+/*
+ * __curlog_kv --
+ * Set the key and value of the log cursor to return to the user.
+ */
+static int
+__curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ uint32_t fileid, key_count, opsize, optype;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+ /*
+ * If it is a commit and we have stepped over the header, peek to get
+ * the size and optype and read out any key/value from this operation.
+ */
+ if ((key_count = cl->step_count++) > 0) {
+ WT_RET(__wt_logop_read(session,
+ &cl->stepp, cl->stepp_end, &optype, &opsize));
+ WT_RET(__curlog_op_read(session, cl, optype, opsize, &fileid));
+ /* Position on the beginning of the next record part. */
+ cl->stepp += opsize;
+ } else {
+ optype = WT_LOGOP_INVALID;
+ fileid = 0;
+ cl->opkey->data = NULL;
+ cl->opkey->size = 0;
+ /*
+ * For non-commit records, return the record without the header
+ * and with the size adjusted. Add one to skip over the type,
+ * which is normally consumed by __wt_logrec_read.
+ */
+ cl->opvalue->data = LOG_SKIP_HEADER(cl->logrec->data) + 1;
+ cl->opvalue->size = LOG_REC_SIZE(cl->logrec->size) - 1;
+ }
+ /*
+ * The log cursor sets the LSN and step count as the cursor key,
+ * and log record related data in the value. The data in the value
+ * contains any operation key/value that was in the log record.
+ */
+ __wt_cursor_set_key(cursor, cl->cur_lsn->file, cl->cur_lsn->offset,
+ key_count);
+ __wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype,
+ fileid, cl->opkey, cl->opvalue);
+ return (0);
+}
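+
+/*
+ * An illustrative sketch of reading back the key and value set above; the
+ * argument types follow LOGC_KEY_FORMAT and LOGC_VALUE_FORMAT:
+ *
+ * ret = cursor->get_key(cursor, &lsn_file, &lsn_offset, &counter);
+ * ret = cursor->get_value(cursor,
+ *     &txnid, &rectype, &optype, &fileid, &key, &value);
+ */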
+
+/*
+ * __curlog_next --
+ * WT_CURSOR.next method for the step log cursor type.
+ */
+static int
+__curlog_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ /*
+ * If we don't have a record, or went to the end of the record we
+ * have, or we are in the zero-fill portion of the record, get a
+ * new one.
+ */
+ if (cl->stepp == NULL || cl->stepp >= cl->stepp_end || !*cl->stepp) {
+ cl->txnid = 0;
+ WT_ERR(__wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE,
+ __curlog_logrec, cl));
+ }
+ WT_ASSERT(session, cl->logrec->data != NULL);
+ WT_ERR(__curlog_kv(session, cursor));
+ WT_STAT_FAST_CONN_INCR(session, cursor_next);
+ WT_STAT_FAST_DATA_INCR(session, cursor_next);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curlog_search --
+ * WT_CURSOR.search method for the log cursor type.
+ */
+static int
+__curlog_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LSN key;
+ WT_SESSION_IMPL *session;
+ uint32_t counter;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ /*
+ * !!! We are ignoring the counter and only searching based on the LSN.
+ */
+ WT_ERR(__wt_cursor_get_key((WT_CURSOR *)cl,
+ &key.file, &key.offset, &counter));
+ WT_ERR(__wt_log_scan(session, &key, WT_LOGSCAN_ONE,
+ __curlog_logrec, cl));
+ WT_ERR(__curlog_kv(session, cursor));
+ WT_STAT_FAST_CONN_INCR(session, cursor_search);
+ WT_STAT_FAST_DATA_INCR(session, cursor_search);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curlog_reset --
+ * WT_CURSOR.reset method for the log cursor type.
+ */
+static int
+__curlog_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LOG *cl;
+
+ cl = (WT_CURSOR_LOG *)cursor;
+ cl->stepp = cl->stepp_end = NULL;
+ cl->step_count = 0;
+ INIT_LSN(cl->cur_lsn);
+ INIT_LSN(cl->next_lsn);
+ return (0);
+}
+
+/*
+ * __curlog_close --
+ * WT_CURSOR.close method for the log cursor type.
+ */
+static int
+__curlog_close(WT_CURSOR *cursor)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(cursor, session, close, NULL);
+ cl = (WT_CURSOR_LOG *)cursor;
+ conn = S2C(session);
+ WT_ASSERT(session, conn->logging);
+ log = conn->log;
+ WT_TRET(__wt_readunlock(session, log->log_archive_lock));
+ WT_TRET(__curlog_reset(cursor));
+ __wt_free(session, cl->cur_lsn);
+ __wt_free(session, cl->next_lsn);
+ __wt_scr_free(&cl->logrec);
+ __wt_scr_free(&cl->opkey);
+ __wt_scr_free(&cl->opvalue);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curlog_open --
+ * Initialize a log cursor.
+ */
+int
+__wt_curlog_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curlog_compare, /* compare */
+ __curlog_next, /* next */
+ __wt_cursor_notsup, /* prev */
+ __curlog_reset, /* reset */
+ __curlog_search, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curlog_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_LOG *cl;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);
+ conn = S2C(session);
+ if (!conn->logging)
+ WT_RET_MSG(session, EINVAL,
+ "Cannot open a log cursor without logging enabled");
+
+ log = conn->log;
+ cl = NULL;
+ WT_RET(__wt_calloc_def(session, 1, &cl));
+ cursor = &cl->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ WT_ERR(__wt_calloc_def(session, 1, &cl->cur_lsn));
+ WT_ERR(__wt_calloc_def(session, 1, &cl->next_lsn));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
+ WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
+ cursor->key_format = LOGC_KEY_FORMAT;
+ cursor->value_format = LOGC_VALUE_FORMAT;
+
+ INIT_LSN(cl->cur_lsn);
+ INIT_LSN(cl->next_lsn);
+
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ /* Log cursors are read only. */
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));
+ /* Log cursors block archiving. */
+ WT_ERR(__wt_readlock(session, log->log_archive_lock));
+
+ if (0) {
+err: if (F_ISSET(cursor, WT_CURSTD_OPEN))
+ WT_TRET(cursor->close(cursor));
+ else {
+ __wt_free(session, cl->cur_lsn);
+ __wt_free(session, cl->next_lsn);
+ __wt_scr_free(&cl->logrec);
+ __wt_scr_free(&cl->opkey);
+ __wt_scr_free(&cl->opvalue);
+ /*
+ * NOTE: We cannot get on the error path with the
+ * readlock held. No need to unlock it unless that
+ * changes above.
+ */
+ __wt_free(session, cl);
+ }
+ *cursorp = NULL;
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_metadata.c b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
new file mode 100644
index 00000000000..30fe3b28625
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_metadata.c
@@ -0,0 +1,444 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Custom NEED macros for metadata cursors that copy the key or value into
+ * the backing metadata table cursor.
+ */
+#define WT_MD_CURSOR_NEEDKEY(cursor) do { \
+ WT_CURSOR_NEEDKEY(cursor); \
+ WT_ERR(__wt_buf_set(session, \
+ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->key, \
+ cursor->key.data, cursor->key.size)); \
+ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \
+ WT_CURSTD_KEY_EXT); \
+} while (0)
+
+#define WT_MD_CURSOR_NEEDVALUE(cursor) do { \
+ WT_CURSOR_NEEDVALUE(cursor); \
+ WT_ERR(__wt_buf_set(session, \
+ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->value, \
+ cursor->value.data, cursor->value.size)); \
+ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \
+ WT_CURSTD_VALUE_EXT); \
+} while (0)
+
+#define WT_MD_SET_KEY_VALUE(c, mc, fc) do { \
+ (c)->key.data = (fc)->key.data; \
+ (c)->key.size = (fc)->key.size; \
+ (c)->value.data = (fc)->value.data; \
+ (c)->value.size = (fc)->value.size; \
+ F_SET((c), WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \
+ F_CLR((mc), WT_MDC_ONMETADATA); \
+ F_SET((mc), WT_MDC_POSITIONED); \
+} while (0)
+
+/*
+ * Check if a key matches the metadata. The public value is "metadata:",
+ * but also check for the internal version of the URI.
+ */
+#define WT_KEY_IS_METADATA(key) \
+ (WT_STRING_MATCH(WT_METADATA_URI, (key)->data, (key)->size - 1) ||\
+ WT_STRING_MATCH(WT_METAFILE_URI, (key)->data, (key)->size - 1))
+
+/*
+ * __curmetadata_metadata_search --
+ * Retrieve the metadata for the metadata table
+ */
+static int
+__curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
+{
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ const char *value;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+
+ /* The metadata search interface allocates a new string in value. */
+ WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value));
+
+ /*
+ * Copy the value to the underlying btree cursor's tmp item which will
+ * be freed when the cursor is closed.
+ */
+ ret = __wt_buf_setstr(session, &cursor->value, value);
+ __wt_free(session, value);
+ WT_RET(ret);
+
+ WT_RET(__wt_buf_setstr(session, &cursor->key, WT_METADATA_URI));
+
+ F_SET(mdc, WT_MDC_ONMETADATA | WT_MDC_POSITIONED);
+ F_SET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ return (0);
+}
+
+/*
+ * __curmetadata_compare --
+ * WT_CURSOR->compare method for the metadata cursor type.
+ */
+static int
+__curmetadata_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR *a_file_cursor, *b_file_cursor;
+ WT_CURSOR_METADATA *a_mdc, *b_mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ a_mdc = ((WT_CURSOR_METADATA *)a);
+ b_mdc = ((WT_CURSOR_METADATA *)b);
+ a_file_cursor = a_mdc->file_cursor;
+ b_file_cursor = b_mdc->file_cursor;
+
+ CURSOR_API_CALL(a, session,
+ compare, ((WT_CURSOR_BTREE *)a_file_cursor)->btree);
+
+ if (b->compare != __curmetadata_compare)
+ WT_ERR_MSG(session, EINVAL,
+ "Can only compare cursors of the same type");
+
+ WT_MD_CURSOR_NEEDKEY(a);
+ WT_MD_CURSOR_NEEDKEY(b);
+
+ if (F_ISSET(a_mdc, WT_MDC_ONMETADATA)) {
+ if (F_ISSET(b_mdc, WT_MDC_ONMETADATA))
+ *cmpp = 0;
+ else
+ *cmpp = 1;
+ } else if (F_ISSET(b_mdc, WT_MDC_ONMETADATA))
+ *cmpp = -1;
+ else
+ ret = a_file_cursor->compare(
+ a_file_cursor, b_file_cursor, cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_next --
+ * WT_CURSOR->next method for the metadata cursor type.
+ */
+static int
+__curmetadata_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ next, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ if (!F_ISSET(mdc, WT_MDC_POSITIONED))
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ else {
+ WT_ERR(file_cursor->next(mdc->file_cursor));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
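+
+/*
+ * An illustrative sketch: iterating a metadata cursor returns the metadata
+ * file's own entry first, then the entries stored in the metadata file.
+ *
+ * ret = session->open_cursor(session, "metadata:", NULL, NULL, &c);
+ * while ((ret = c->next(c)) == 0) {
+ *     ret = c->get_key(c, &key);
+ *     ret = c->get_value(c, &value);
+ * }
+ */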
+
+/*
+ * __curmetadata_prev --
+ * WT_CURSOR->prev method for the metadata cursor type.
+ */
+static int
+__curmetadata_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ prev, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ if (F_ISSET(mdc, WT_MDC_ONMETADATA)) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+
+ ret = file_cursor->prev(file_cursor);
+ if (ret == 0) {
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ } else if (ret == WT_NOTFOUND)
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_reset --
+ * WT_CURSOR->reset method for the metadata cursor type.
+ */
+static int
+__curmetadata_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ reset, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ if (F_ISSET(mdc, WT_MDC_POSITIONED) && !F_ISSET(mdc, WT_MDC_ONMETADATA))
+ ret = file_cursor->reset(file_cursor);
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_search --
+ * WT_CURSOR->search method for the metadata cursor type.
+ */
+static int
+__curmetadata_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ search, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ if (WT_KEY_IS_METADATA(&cursor->key))
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ else {
+ WT_ERR(file_cursor->search(file_cursor));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_search_near --
+ * WT_CURSOR->search_near method for the metadata cursor type.
+ */
+static int
+__curmetadata_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ search_near, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ if (WT_KEY_IS_METADATA(&cursor->key)) {
+ WT_ERR(__curmetadata_metadata_search(session, cursor));
+ *exact = 1;
+ } else {
+ WT_ERR(file_cursor->search_near(file_cursor, exact));
+ WT_MD_SET_KEY_VALUE(cursor, mdc, file_cursor);
+ }
+
+err: if (ret != 0) {
+ F_CLR(mdc, WT_MDC_POSITIONED | WT_MDC_ONMETADATA);
+ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ }
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_insert --
+ * WT_CURSOR->insert method for the metadata cursor type.
+ */
+static int
+__curmetadata_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ insert, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+ WT_MD_CURSOR_NEEDVALUE(cursor);
+
+ /*
+	 * Since the key/value formats are 'S', the WT_ITEMs must contain
+	 * NUL-terminated strings.
+ */
+ ret =
+ __wt_metadata_insert(session, cursor->key.data, cursor->value.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_update --
+ * WT_CURSOR->update method for the metadata cursor type.
+ */
+static int
+__curmetadata_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ update, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+ WT_MD_CURSOR_NEEDVALUE(cursor);
+
+ /*
+	 * Since the key/value formats are 'S', the WT_ITEMs must contain
+	 * NUL-terminated strings.
+ */
+ ret =
+ __wt_metadata_update(session, cursor->key.data, cursor->value.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_remove --
+ * WT_CURSOR->remove method for the metadata cursor type.
+ */
+static int
+__curmetadata_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ remove, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ WT_MD_CURSOR_NEEDKEY(cursor);
+
+ /*
+	 * Since the key format is 'S', the WT_ITEM must contain a
+	 * NUL-terminated string.
+ */
+ ret = __wt_metadata_remove(session, cursor->key.data);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curmetadata_close --
+ * WT_CURSOR->close method for the metadata cursor type.
+ */
+static int
+__curmetadata_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ mdc = (WT_CURSOR_METADATA *)cursor;
+ file_cursor = mdc->file_cursor;
+ CURSOR_API_CALL(cursor, session,
+ close, ((WT_CURSOR_BTREE *)file_cursor)->btree);
+
+ ret = file_cursor->close(file_cursor);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curmetadata_open --
+ * WT_SESSION->open_cursor method for metadata cursors.
+ *
+ * Metadata cursors are similar to a file cursor on the special metadata
+ * table, except that the metadata for the metadata table itself (which is
+ * stored in the turtle file) can also be queried.
+ *
+ * Metadata cursors are read-only by default.
+ */
+int
+__wt_curmetadata_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __curmetadata_compare, /* compare */
+ __curmetadata_next, /* next */
+ __curmetadata_prev, /* prev */
+ __curmetadata_reset, /* reset */
+ __curmetadata_search, /* search */
+ __curmetadata_search_near, /* search-near */
+ __curmetadata_insert, /* insert */
+ __curmetadata_update, /* update */
+ __curmetadata_remove, /* remove */
+ __curmetadata_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_METADATA *mdc;
+ WT_DECL_RET;
+
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_CURSOR_METADATA), &mdc));
+
+ cursor = &mdc->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->key_format = "S";
+ cursor->value_format = "S";
+
+	/* Open the file cursor for operations on the regular metadata. */
+ WT_ERR(__wt_metadata_cursor(session, cfg[1], &mdc->file_cursor));
+
+ WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));
+
+ /* Metadata cursors default to read only. */
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));
+
+ if (0) {
+err: __wt_free(session, mdc);
+ }
+ return (ret);
+}
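+
+/*
+ * Illustrative sketch, not part of the original change: walking the
+ * metadata through the public API with the cursor type implemented above.
+ * The session handle is assumed to be open; the WT_EXAMPLE_USAGE guard is
+ * hypothetical and keeps the sketch out of builds.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_metadata_scan(WT_SESSION *wt_session)
+{
+	WT_CURSOR *cursor;
+	const char *key, *value;
+	int ret;
+
+	if ((ret = wt_session->open_cursor(
+	    wt_session, "metadata:", NULL, NULL, &cursor)) != 0)
+		return (ret);
+
+	/* Both the key and value formats are NUL-terminated strings. */
+	while ((ret = cursor->next(cursor)) == 0) {
+		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
+		    (ret = cursor->get_value(cursor, &value)) != 0)
+			break;
+		printf("%s -> %s\n", key, value);
+	}
+	if (ret == WT_NOTFOUND)		/* End of the metadata. */
+		ret = 0;
+	(void)cursor->close(cursor);
+	return (ret);
+}
+#endif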
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
new file mode 100644
index 00000000000..c06efced369
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -0,0 +1,574 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __curstat_next(WT_CURSOR *cursor);
+static int __curstat_prev(WT_CURSOR *cursor);
+
+/*
+ * The statistics identifier is an offset from a base, ensuring the integer ID
+ * values of different statistics sources don't overlap (if they overlapped,
+ * it would be easy for application writers to confuse them).
+ */
+#define WT_STAT_KEY_MAX(cst) (((cst)->stats_base + (cst)->stats_count) - 1)
+#define WT_STAT_KEY_MIN(cst) ((cst)->stats_base)
+#define WT_STAT_KEY_OFFSET(cst) ((cst)->key - (cst)->stats_base)
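+
+/*
+ * For example (numbers illustrative, not the real bases): with a base of
+ * 1000 and a count of 3, the valid keys are 1000 through 1002, and key
+ * 1001 maps to offset 1 in the statistics array.
+ */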
+
+/*
+ * __curstat_print_value --
+ * Convert statistics cursor value to printable format.
+ */
+static int
+__curstat_print_value(WT_SESSION_IMPL *session, uint64_t v, WT_ITEM *buf)
+{
+ if (v >= WT_BILLION)
+ WT_RET(__wt_buf_fmt(session, buf,
+ "%" PRIu64 "B (%" PRIu64 ")", v / WT_BILLION, v));
+ else if (v >= WT_MILLION)
+ WT_RET(__wt_buf_fmt(session, buf,
+ "%" PRIu64 "M (%" PRIu64 ")", v / WT_MILLION, v));
+ else
+ WT_RET(__wt_buf_fmt(session, buf, "%" PRIu64, v));
+
+ return (0);
+}
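+
+/*
+ * For example, a value of 2500000000 prints as "2B (2500000000)", a value
+ * of 2500000 as "2M (2500000)" and a value of 2500 simply as "2500".
+ */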
+
+/*
+ * __curstat_get_key --
+ * WT_CURSOR->get_key for statistics cursors.
+ */
+static int
+__curstat_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ va_list ap;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+
+ WT_CURSOR_NEEDKEY(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(__wt_struct_size(
+ session, &size, cursor->key_format, cst->key));
+ WT_ERR(__wt_buf_initsize(session, &cursor->key, size));
+ WT_ERR(__wt_struct_pack(session, cursor->key.mem, size,
+ cursor->key_format, cst->key));
+
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->key.data;
+ item->size = cursor->key.size;
+ } else
+ *va_arg(ap, int *) = cst->key;
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_get_value --
+ * WT_CURSOR->get_value for statistics cursors.
+ */
+static int
+__curstat_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ size_t size;
+ uint64_t *v;
+ const char **p;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ va_start(ap, cursor);
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ WT_CURSOR_NEEDVALUE(cursor);
+
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ WT_ERR(__wt_struct_size(session, &size, cursor->value_format,
+ cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->pv.data, cst->v));
+ WT_ERR(__wt_buf_initsize(session, &cursor->value, size));
+ WT_ERR(__wt_struct_pack(session, cursor->value.mem, size,
+ cursor->value_format,
+ cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->pv.data, cst->v));
+
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ } else {
+ /*
+ * Don't drop core if the statistics value isn't requested; NULL
+ * pointer support isn't documented, but it's a cheap test.
+ */
+ if ((p = va_arg(ap, const char **)) != NULL)
+ *p = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc;
+ if ((p = va_arg(ap, const char **)) != NULL)
+ *p = cst->pv.data;
+ if ((v = va_arg(ap, uint64_t *)) != NULL)
+ *v = cst->v;
+ }
+
+err: va_end(ap);
+ API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_set_key --
+ * WT_CURSOR->set_key for statistics cursors.
+ */
+static void
+__curstat_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSTD_RAW)) {
+ item = va_arg(ap, WT_ITEM *);
+ ret = __wt_struct_unpack(session, item->data, item->size,
+ cursor->key_format, &cst->key);
+ } else
+ cst->key = va_arg(ap, int);
+ va_end(ap);
+
+ if ((cursor->saved_err = ret) == 0)
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+
+err: API_END(session, ret);
+}
+
+/*
+ * __curstat_set_value --
+ * WT_CURSOR->set_value for statistics cursors.
+ */
+static void
+__curstat_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_UNUSED(cursor);
+ return;
+}
+
+/*
+ * __curstat_next --
+ * WT_CURSOR->next method for the statistics cursor type.
+ */
+static int
+__curstat_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+
+ /* Move to the next item. */
+ if (cst->notpositioned) {
+ cst->notpositioned = 0;
+ cst->key = WT_STAT_KEY_MIN(cst);
+ } else if (cst->key < WT_STAT_KEY_MAX(cst))
+ ++cst->key;
+ else {
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_ERR(WT_NOTFOUND);
+ }
+ cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_prev --
+ * WT_CURSOR->prev method for the statistics cursor type.
+ */
+static int
+__curstat_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+
+ /* Move to the previous item. */
+ if (cst->notpositioned) {
+ cst->notpositioned = 0;
+ cst->key = WT_STAT_KEY_MAX(cst);
+ } else if (cst->key > WT_STAT_KEY_MIN(cst))
+ --cst->key;
+ else {
+ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ WT_ERR(WT_NOTFOUND);
+ }
+
+ cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_reset --
+ * WT_CURSOR->reset method for the statistics cursor type.
+ */
+static int
+__curstat_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+
+ cst->notpositioned = 1;
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_search --
+ * WT_CURSOR->search method for the statistics cursor type.
+ */
+static int
+__curstat_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+
+ WT_CURSOR_NEEDKEY(cursor);
+	F_CLR(cursor, WT_CURSTD_VALUE_SET);
+
+ if (cst->key < WT_STAT_KEY_MIN(cst) || cst->key > WT_STAT_KEY_MAX(cst))
+ WT_ERR(WT_NOTFOUND);
+
+ cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_close --
+ * WT_CURSOR->close method for the statistics cursor type.
+ */
+static int
+__curstat_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cst = (WT_CURSOR_STAT *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ __wt_buf_free(session, &cst->pv);
+
+ WT_ERR(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curstat_conn_init --
+ * Initialize the statistics for a connection.
+ */
+static void
+__curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * Fill in the connection statistics, and copy them to the cursor.
+ * Optionally clear the connection statistics.
+ */
+ __wt_conn_stat_init(session);
+ cst->u.conn_stats = conn->stats;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_connection_stats(&conn->stats);
+
+ cst->stats_first = cst->stats = (WT_STATS *)&cst->u.conn_stats;
+ cst->stats_base = WT_CONNECTION_STATS_BASE;
+ cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS);
+}
+
+/*
+ * When returning the statistics for a file URI, we review open handles and
+ * aggregate checkpoint handle statistics with the file URI statistics. To
+ * make that work, we have to pass information to the function reviewing the
+ * handles; this structure is what we pass.
+ */
+struct __checkpoint_args {
+ const char *name; /* Data source handle name */
+ WT_DSRC_STATS *stats; /* Stat structure being filled */
+ int clear; /* WT_STATISTICS_CLEAR */
+};
+
+/*
+ * __curstat_checkpoint --
+ * Aggregate statistics from checkpoint handles.
+ */
+static int
+__curstat_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ struct __checkpoint_args *args;
+ WT_DATA_HANDLE *dhandle;
+
+ dhandle = session->dhandle;
+ args = (struct __checkpoint_args *)cfg[0];
+
+ /* Aggregate the flagged file's checkpoint handles. */
+ if (dhandle->checkpoint != NULL &&
+ strcmp(dhandle->name, args->name) == 0) {
+ __wt_stat_aggregate_dsrc_stats(&dhandle->stats, args->stats);
+ if (args->clear)
+ __wt_stat_refresh_dsrc_stats(&dhandle->stats);
+ }
+
+ return (0);
+}
+
+/*
+ * __curstat_file_init --
+ * Initialize the statistics for a file.
+ */
+static int
+__curstat_file_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ struct __checkpoint_args args;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+ const char *cfg_arg[] = { NULL, NULL };
+
+ WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0));
+ dhandle = session->dhandle;
+
+ /*
+ * Fill in the data source statistics, and copy them to the cursor.
+ * Optionally clear the data source statistics.
+ */
+ if ((ret = __wt_btree_stat_init(session, cst)) == 0) {
+ cst->u.dsrc_stats = dhandle->stats;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_dsrc_stats(&dhandle->stats);
+ __wt_curstat_dsrc_final(cst);
+ }
+
+ /* Release the handle, we're done with it. */
+ WT_TRET(__wt_session_release_btree(session));
+ WT_RET(ret);
+
+ /*
+ * If no checkpoint was specified, review the open handles and aggregate
+ * the statistics from any checkpoint handles matching this file.
+ */
+ if (dhandle->checkpoint == NULL) {
+ args.name = dhandle->name;
+ args.stats = &cst->u.dsrc_stats;
+ args.clear = F_ISSET(cst, WT_CONN_STAT_CLEAR);
+ cfg_arg[0] = (char *)&args;
+
+ /*
+ * We're likely holding the schema lock inside the statistics
+ * logging thread, not to mention calling __wt_conn_btree_apply
+ * from there as well. Save/restore the handle.
+ */
+ saved_dhandle = dhandle;
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_conn_btree_apply(
+ session, 1, __curstat_checkpoint, cfg_arg));
+ session->dhandle = saved_dhandle;
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_curstat_dsrc_final --
+ * Finalize a data-source statistics cursor.
+ */
+void
+__wt_curstat_dsrc_final(WT_CURSOR_STAT *cst)
+{
+ cst->stats_first = cst->stats = (WT_STATS *)&cst->u.dsrc_stats;
+ cst->stats_base = WT_DSRC_STATS_BASE;
+ cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+}
+
+/*
+ * __wt_curstat_init --
+ * Initialize a statistics cursor.
+ */
+int
+__wt_curstat_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ const char *dsrc_uri;
+
+ cst->notpositioned = 1;
+
+ if (strcmp(uri, "statistics:") == 0) {
+ __curstat_conn_init(session, cst);
+ return (0);
+ }
+
+ dsrc_uri = uri + strlen("statistics:");
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:"))
+ return (
+ __wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "file:"))
+ return (__curstat_file_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "index:"))
+ return (__wt_curstat_index_init(session, dsrc_uri, cfg, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "lsm:"))
+ return (__wt_curstat_lsm_init(session, dsrc_uri, cst));
+
+ if (WT_PREFIX_MATCH(dsrc_uri, "table:"))
+ return (__wt_curstat_table_init(session, dsrc_uri, cfg, cst));
+
+ return (__wt_bad_object_type(session, uri));
+}
+
+/*
+ * __wt_curstat_open --
+ * WT_SESSION->open_cursor method for the statistics cursor type.
+ */
+int
+__wt_curstat_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR_STATIC_INIT(iface,
+ __curstat_get_key, /* get-key */
+ __curstat_get_value, /* get-value */
+ __curstat_set_key, /* set-key */
+ __curstat_set_value, /* set-value */
+ __wt_cursor_notsup, /* compare */
+ __curstat_next, /* next */
+ __curstat_prev, /* prev */
+ __curstat_reset, /* reset */
+ __curstat_search, /* search */
+ __wt_cursor_notsup, /* search-near */
+ __wt_cursor_notsup, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __curstat_close); /* close */
+ WT_CONFIG_ITEM cval, sval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_STAT *cst;
+ WT_DECL_RET;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_STAT, iface) == 0);
+
+ conn = S2C(session);
+
+ WT_ERR(__wt_calloc_def(session, 1, &cst));
+ cursor = &cst->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+
+ /*
+	 * Statistics cursor configuration: must match (and defaults to) the
+ * database configuration.
+ */
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_NONE))
+ goto config_err;
+ if ((ret = __wt_config_gets(session, cfg, "statistics", &cval)) == 0) {
+ if ((ret = __wt_config_subgets(
+ session, &cval, "all", &sval)) == 0 && sval.val != 0) {
+ if (!FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL))
+ goto config_err;
+ F_SET(cst, WT_CONN_STAT_ALL | WT_CONN_STAT_FAST);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if ((ret = __wt_config_subgets(
+ session, &cval, "fast", &sval)) == 0 && sval.val != 0) {
+ if (F_ISSET(cst, WT_CONN_STAT_ALL))
+ WT_ERR_MSG(session, EINVAL,
+ "only one statistics configuration value "
+ "may be specified");
+ F_SET(cst, WT_CONN_STAT_FAST);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if ((ret = __wt_config_subgets(
+ session, &cval, "clear", &sval)) == 0 && sval.val != 0)
+ F_SET(cst, WT_CONN_STAT_CLEAR);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* If no configuration, use the connection's configuration. */
+ if (cst->flags == 0) {
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ALL))
+ F_SET(cst, WT_CONN_STAT_ALL);
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_FAST))
+ F_SET(cst, WT_CONN_STAT_FAST);
+ }
+
+ /* If the connection configures clear, so do we. */
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ F_SET(cst, WT_CONN_STAT_CLEAR);
+ }
+
+ /*
+ * We return the statistics field's offset as the key, and a string
+ * description, a string value, and a uint64_t value as the value
+ * columns.
+ */
+ cursor->key_format = "i";
+ cursor->value_format = "SSq";
+ WT_ERR(__wt_curstat_init(session, uri, cfg, cst));
+
+ /* __wt_cursor_init is last so we don't have to clean up on error. */
+ WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+
+ if (0) {
+config_err: WT_ERR_MSG(session, EINVAL,
+ "cursor's statistics configuration doesn't match the "
+ "database statistics configuration");
+ }
+
+ if (0) {
+err: __wt_free(session, cst);
+ }
+
+ return (ret);
+}
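+
+/*
+ * Illustrative sketch, not part of the original change: reading database
+ * statistics through the public API. Per the formats set in
+ * __wt_curstat_open, the key is an "i" integer and the value unpacks as
+ * "SSq": a description, a printable value string and a 64-bit count. The
+ * WT_EXAMPLE_USAGE guard is hypothetical.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_print_database_stats(WT_SESSION *wt_session)
+{
+	WT_CURSOR *cursor;
+	uint64_t value;
+	int ret;
+	const char *desc, *pvalue;
+
+	if ((ret = wt_session->open_cursor(
+	    wt_session, "statistics:", NULL, NULL, &cursor)) != 0)
+		return (ret);
+	while ((ret = cursor->next(cursor)) == 0) {
+		if ((ret = cursor->get_value(
+		    cursor, &desc, &pvalue, &value)) != 0)
+			break;
+		printf("%s=%s\n", desc, pvalue);
+	}
+	(void)cursor->close(cursor);
+	return (ret == WT_NOTFOUND ? 0 : ret);
+}
+#endif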
diff --git a/src/third_party/wiredtiger/src/cursor/cur_std.c b/src/third_party/wiredtiger/src/cursor/cur_std.c
new file mode 100644
index 00000000000..21d676d943a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_std.c
@@ -0,0 +1,625 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cursor_notsup --
+ * Unsupported cursor actions.
+ */
+int
+__wt_cursor_notsup(WT_CURSOR *cursor)
+{
+ WT_UNUSED(cursor);
+
+ return (ENOTSUP);
+}
+
+/*
+ * __wt_cursor_noop --
+ * Cursor noop.
+ */
+int
+__wt_cursor_noop(WT_CURSOR *cursor)
+{
+ WT_UNUSED(cursor);
+
+ return (0);
+}
+
+/*
+ * __wt_cursor_set_notsup --
+ * Reset the cursor methods to not-supported.
+ */
+void
+__wt_cursor_set_notsup(WT_CURSOR *cursor)
+{
+ /*
+	 * Set all of the cursor methods (except for close and reset) to fail.
+	 * Close is unchanged so the cursor can be discarded; reset defaults to
+	 * a no-op because session transactional operations reset all of a
+	 * session's cursors, and random cursors shouldn't block transactions
+ * or checkpoints.
+ */
+ cursor->compare =
+ (int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup;
+ cursor->next = __wt_cursor_notsup;
+ cursor->prev = __wt_cursor_notsup;
+ cursor->reset = __wt_cursor_noop;
+ cursor->search = __wt_cursor_notsup;
+ cursor->search_near = (int (*)(WT_CURSOR *, int *))__wt_cursor_notsup;
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+}
+
+/*
+ * __wt_cursor_config_readonly --
+ *	Parse the read-only configuration and set up the cursor appropriately.
+ */
+int
+__wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def)
+{
+ WT_CONFIG_ITEM cval;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "readonly", def, &cval));
+ if (cval.val != 0) {
+ /* Reset all cursor methods that could modify data. */
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+ }
+ return (0);
+}
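+
+/*
+ * For example, a cursor opened with the "readonly=true" configuration
+ * string has its insert, update and remove methods replaced by
+ * __wt_cursor_notsup, so any attempt to modify data returns ENOTSUP.
+ */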
+
+/*
+ * __wt_cursor_kv_not_set --
+ * Standard error message for key/values not set.
+ */
+int
+__wt_cursor_kv_not_set(WT_CURSOR *cursor, int key)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_RET_MSG(session,
+ cursor->saved_err == 0 ? EINVAL : cursor->saved_err,
+ "requires %s be set", key ? "key" : "value");
+}
+
+/*
+ * __wt_cursor_get_key --
+ * WT_CURSOR->get_key default implementation.
+ */
+int
+__wt_cursor_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_keyv(cursor, cursor->flags, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_key --
+ * WT_CURSOR->set_key default implementation.
+ */
+void
+__wt_cursor_set_key(WT_CURSOR *cursor, ...)
+{
+ va_list ap;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_keyv(cursor, cursor->flags, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_cursor_get_raw_key --
+ * Temporarily force raw mode in a cursor to get a canonical copy of
+ * the key.
+ */
+int
+__wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key)
+{
+ WT_DECL_RET;
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ ret = cursor->get_key(cursor, key);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_raw_key --
+ * Temporarily force raw mode in a cursor to set a canonical copy of
+ * the key.
+ */
+void
+__wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key)
+{
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ cursor->set_key(cursor, key);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+}
+
+/*
+ * __wt_cursor_get_raw_value --
+ * Temporarily force raw mode in a cursor to get a canonical copy of
+ * the value.
+ */
+int
+__wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ WT_DECL_RET;
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ ret = cursor->get_value(cursor, value);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_set_raw_value --
+ * Temporarily force raw mode in a cursor to set a canonical copy of
+ * the value.
+ */
+void
+__wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value)
+{
+ int raw_set;
+
+ raw_set = F_ISSET(cursor, WT_CURSTD_RAW) ? 1 : 0;
+ if (!raw_set)
+ F_SET(cursor, WT_CURSTD_RAW);
+ cursor->set_value(cursor, value);
+ if (!raw_set)
+ F_CLR(cursor, WT_CURSTD_RAW);
+}
+
+/*
+ * __wt_cursor_get_keyv --
+ * WT_CURSOR->get_key worker function.
+ */
+int
+__wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *key;
+ WT_SESSION_IMPL *session;
+ size_t size;
+ const char *fmt;
+
+ CURSOR_API_CALL(cursor, session, get_key, NULL);
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT))
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 1));
+
+ if (WT_CURSOR_RECNO(cursor)) {
+ if (LF_ISSET(WT_CURSTD_RAW)) {
+ key = va_arg(ap, WT_ITEM *);
+ key->data = cursor->raw_recno_buf;
+ WT_ERR(__wt_struct_size(
+ session, &size, "q", cursor->recno));
+ key->size = size;
+ ret = __wt_struct_pack(session, cursor->raw_recno_buf,
+ sizeof(cursor->raw_recno_buf), "q", cursor->recno);
+ } else
+ *va_arg(ap, uint64_t *) = cursor->recno;
+ } else {
+ /* Fast path some common cases. */
+ fmt = cursor->key_format;
+ if (LF_ISSET(WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) {
+ key = va_arg(ap, WT_ITEM *);
+ key->data = cursor->key.data;
+ key->size = cursor->key.size;
+ } else if (WT_STREQ(fmt, "S"))
+ *va_arg(ap, const char **) = cursor->key.data;
+ else
+ ret = __wt_struct_unpackv(session,
+ cursor->key.data, cursor->key.size, fmt, ap);
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_cursor_set_keyv --
+ * WT_CURSOR->set_key default implementation.
+ */
+void
+__wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_ITEM *buf, *item;
+ size_t sz;
+ va_list ap_copy;
+ const char *fmt, *str;
+
+ CURSOR_API_CALL(cursor, session, set_key, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET);
+
+ if (WT_CURSOR_RECNO(cursor)) {
+ if (LF_ISSET(WT_CURSTD_RAW)) {
+ item = va_arg(ap, WT_ITEM *);
+ WT_ERR(__wt_struct_unpack(session,
+ item->data, item->size, "q", &cursor->recno));
+ } else
+ cursor->recno = va_arg(ap, uint64_t);
+ if (cursor->recno == 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Record numbers must be greater than zero");
+ cursor->key.data = &cursor->recno;
+ sz = sizeof(cursor->recno);
+ } else {
+ /* Fast path some common cases and special case WT_ITEMs. */
+ fmt = cursor->key_format;
+ if (LF_ISSET(WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) ||
+ WT_STREQ(fmt, "u")) {
+ item = va_arg(ap, WT_ITEM *);
+ sz = item->size;
+ cursor->key.data = item->data;
+ } else if (WT_STREQ(fmt, "S")) {
+ str = va_arg(ap, const char *);
+ sz = strlen(str) + 1;
+ cursor->key.data = (void *)str;
+ } else {
+ buf = &cursor->key;
+
+ va_copy(ap_copy, ap);
+ ret = __wt_struct_sizev(
+ session, &sz, cursor->key_format, ap_copy);
+ va_end(ap_copy);
+ WT_ERR(ret);
+
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ WT_ERR(__wt_struct_packv(
+ session, buf->mem, sz, cursor->key_format, ap));
+ }
+ }
+ if (sz == 0)
+ WT_ERR_MSG(session, EINVAL, "Empty keys not permitted");
+ else if ((uint32_t)sz != sz)
+ WT_ERR_MSG(session, EINVAL,
+ "Key size (%" PRIu64 ") out of range", (uint64_t)sz);
+ cursor->saved_err = 0;
+ cursor->key.size = sz;
+ F_SET(cursor, WT_CURSTD_KEY_EXT);
+ if (0) {
+err: cursor->saved_err = ret;
+ }
+
+ API_END(session, ret);
+}
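+
+/*
+ * For example, with key_format "S" an application passes a string to
+ * set_key, with key_format "r" (record-number cursors) it passes a
+ * uint64_t record number, and in raw mode it passes a WT_ITEM whose
+ * data/size fields describe an already-packed key.
+ */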
+
+/*
+ * __wt_cursor_get_value --
+ * WT_CURSOR->get_value default implementation.
+ */
+int
+__wt_cursor_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_valuev(cursor, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_cursor_get_valuev --
+ * WT_CURSOR->get_value worker implementation.
+ */
+int
+__wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *value;
+ WT_SESSION_IMPL *session;
+ const char *fmt;
+
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+
+ if (!F_ISSET(cursor, WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT))
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 0));
+
+ /* Fast path some common cases. */
+ fmt = cursor->value_format;
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) {
+ value = va_arg(ap, WT_ITEM *);
+ value->data = cursor->value.data;
+ value->size = cursor->value.size;
+ } else if (WT_STREQ(fmt, "S"))
+ *va_arg(ap, const char **) = cursor->value.data;
+ else if (WT_STREQ(fmt, "t") ||
+ (isdigit(fmt[0]) && WT_STREQ(fmt + 1, "t")))
+ *va_arg(ap, uint8_t *) = *(uint8_t *)cursor->value.data;
+ else
+ ret = __wt_struct_unpackv(session,
+ cursor->value.data, cursor->value.size, fmt, ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_cursor_set_value --
+ * WT_CURSOR->set_value default implementation.
+ */
+void
+__wt_cursor_set_value(WT_CURSOR *cursor, ...)
+{
+ va_list ap;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_valuev(cursor, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_cursor_set_valuev --
+ * WT_CURSOR->set_value worker implementation.
+ */
+void
+__wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap)
+{
+ WT_DECL_RET;
+ WT_ITEM *buf, *item;
+ WT_SESSION_IMPL *session;
+ const char *fmt, *str;
+ va_list ap_copy;
+ size_t sz;
+
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+
+ /* Fast path some common cases. */
+ fmt = cursor->value_format;
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) ||
+ WT_STREQ(fmt, "u")) {
+ item = va_arg(ap, WT_ITEM *);
+ sz = item->size;
+ cursor->value.data = item->data;
+ } else if (WT_STREQ(fmt, "S")) {
+ str = va_arg(ap, const char *);
+ sz = strlen(str) + 1;
+ cursor->value.data = str;
+ } else if (WT_STREQ(fmt, "t") ||
+ (isdigit(fmt[0]) && WT_STREQ(fmt + 1, "t"))) {
+ sz = 1;
+ buf = &cursor->value;
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ *(uint8_t *)buf->mem = (uint8_t)va_arg(ap, int);
+ } else {
+ va_copy(ap_copy, ap);
+ ret = __wt_struct_sizev(session,
+ &sz, cursor->value_format, ap_copy);
+ va_end(ap_copy);
+ WT_ERR(ret);
+ buf = &cursor->value;
+ WT_ERR(__wt_buf_initsize(session, buf, sz));
+ WT_ERR(__wt_struct_packv(session, buf->mem, sz,
+ cursor->value_format, ap));
+ }
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ cursor->value.size = sz;
+
+ if (0) {
+err: cursor->saved_err = ret;
+ }
+ API_END(session, ret);
+}
+
+/*
+ * __wt_cursor_close --
+ * WT_CURSOR->close default implementation.
+ */
+int
+__wt_cursor_close(WT_CURSOR *cursor)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_buf_free(session, &cursor->key);
+ __wt_buf_free(session, &cursor->value);
+
+ if (F_ISSET(cursor, WT_CURSTD_OPEN)) {
+ TAILQ_REMOVE(&session->cursors, cursor, q);
+
+ WT_STAT_FAST_DATA_DECR(session, session_cursor_open);
+ WT_STAT_FAST_CONN_ATOMIC_DECR(session, session_cursor_open);
+ }
+
+ __wt_free(session, cursor->internal_uri);
+ __wt_free(session, cursor->uri);
+ __wt_overwrite_and_free(session, cursor);
+ return (ret);
+}
+
+/*
+ * __cursor_runtime_config --
+ * Set runtime-configurable settings.
+ */
+static int
+__cursor_runtime_config(WT_CURSOR *cursor, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ /*
+ * !!!
+ * There's no way yet to reconfigure cursor flags at runtime; if, in
+	 * the future, there is a way to do that, similar support needs to be
+	 * added for data-source cursors, or this call needs to return an
+ * error in the case of a data-source cursor.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "overwrite", 1, &cval));
+ if (cval.val)
+ F_SET(cursor, WT_CURSTD_OVERWRITE);
+ else
+ F_CLR(cursor, WT_CURSTD_OVERWRITE);
+
+ return (0);
+}
+
+/*
+ * __wt_cursor_dup_position --
+ * Set a cursor to another cursor's position.
+ */
+int
+__wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor)
+{
+ WT_ITEM key;
+
+ /*
+ * Get a copy of the cursor's raw key, and set it in the new cursor,
+ * then search for that key to position the cursor.
+ *
+ * We don't clear the WT_ITEM structure: all that happens when getting
+ * and setting the key is the data/size fields are reset to reference
+ * the original cursor's key.
+ *
+ * That said, we're playing games with the cursor flags: setting the key
+ * sets the key/value application-set flags in the new cursor, which may
+ * or may not be correct, but there's nothing simple that fixes it. We
+ * depend on the subsequent cursor search to clean things up, as search
+ * is required to copy and/or reference private memory after success.
+ */
+ WT_RET(__wt_cursor_get_raw_key(to_dup, &key));
+ __wt_cursor_set_raw_key(cursor, &key);
+
+ /*
+ * We now have a reference to the raw key, but we don't know anything
+ * about the memory in which it's stored, it could be btree/file page
+ * memory in the cache, application memory or the original cursor's
+ * key/value WT_ITEMs. Memory allocated in support of another cursor
+ * could be discarded when that cursor is closed, so it's a problem.
+ * However, doing a search to position the cursor will fix the problem:
+ * cursors cannot reference application memory after cursor operations
+ * and that requirement will save the day.
+ */
+ WT_RET(cursor->search(cursor));
+
+ return (0);
+}
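+
+/*
+ * Illustrative sketch, not part of the original change: cursor duplication
+ * is exposed through WT_SESSION::open_cursor by passing the positioned
+ * cursor as the to_dup argument with a NULL URI. The WT_EXAMPLE_USAGE
+ * guard is hypothetical.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_duplicate_cursor(WT_SESSION *wt_session, WT_CURSOR *positioned)
+{
+	WT_CURSOR *dup;
+	int ret;
+
+	/* The new cursor comes back positioned on the same key. */
+	if ((ret = wt_session->open_cursor(
+	    wt_session, NULL, positioned, NULL, &dup)) != 0)
+		return (ret);
+	return (dup->close(dup));
+}
+#endif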
+
+/*
+ * __wt_cursor_init --
+ * Default cursor initialization.
+ */
+int
+__wt_cursor_init(WT_CURSOR *cursor,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cdump;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ if (cursor->internal_uri == NULL)
+ WT_RET(__wt_strdup(session, uri, &cursor->internal_uri));
+
+ /* Set runtime-configurable settings. */
+ WT_RET(__cursor_runtime_config(cursor, cfg));
+
+ /*
+ * append
+ * The append flag is only relevant to column stores.
+ */
+ if (WT_CURSOR_RECNO(cursor)) {
+ WT_RET(__wt_config_gets_def(session, cfg, "append", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cursor, WT_CURSTD_APPEND);
+ }
+
+ /*
+ * checkpoint
+ * Checkpoint cursors are read-only.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0) {
+ cursor->insert = __wt_cursor_notsup;
+ cursor->update = __wt_cursor_notsup;
+ cursor->remove = __wt_cursor_notsup;
+ }
+
+ /*
+ * dump
+ * If an index cursor is opened with dump, then this
+ * function is called on the index files, with the dump
+ * config string, and with the index cursor as an owner.
+ * We don't want to create a dump cursor in that case, because
+ * we'll create the dump cursor on the index cursor itself.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "dump", 0, &cval));
+ if (cval.len != 0 && owner == NULL) {
+ F_SET(cursor,
+ WT_STRING_MATCH("json", cval.str, cval.len) ?
+ WT_CURSTD_DUMP_JSON :
+ (WT_STRING_MATCH("print", cval.str, cval.len) ?
+ WT_CURSTD_DUMP_PRINT : WT_CURSTD_DUMP_HEX));
+ /*
+ * Dump cursors should not have owners: only the
+ * top-level cursor should be wrapped in a dump cursor.
+ */
+ WT_RET(__wt_curdump_create(cursor, owner, &cdump));
+ owner = cdump;
+ } else
+ cdump = NULL;
+
+ /* raw */
+ WT_RET(__wt_config_gets_def(session, cfg, "raw", 0, &cval));
+ if (cval.val != 0)
+ F_SET(cursor, WT_CURSTD_RAW);
+
+ /* readonly */
+ WT_RET(__wt_cursor_config_readonly(cursor, cfg, 0));
+
+ /*
+ * Cursors that are internal to some other cursor (such as file cursors
+ * inside a table cursor) should be closed after the containing cursor.
+ * Arrange for that to happen by putting internal cursors after their
+ * owners on the queue.
+ */
+ if (owner != NULL) {
+ WT_ASSERT(session, F_ISSET(owner, WT_CURSTD_OPEN));
+ TAILQ_INSERT_AFTER(&session->cursors, owner, cursor, q);
+ } else
+ TAILQ_INSERT_HEAD(&session->cursors, cursor, q);
+
+ F_SET(cursor, WT_CURSTD_OPEN);
+ WT_STAT_FAST_DATA_INCR(session, session_cursor_open);
+ WT_STAT_FAST_CONN_ATOMIC_INCR(session, session_cursor_open);
+
+ *cursorp = (cdump != NULL) ? cdump : cursor;
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/cursor/cur_table.c b/src/third_party/wiredtiger/src/cursor/cur_table.c
new file mode 100644
index 00000000000..ea267f96f9c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/cursor/cur_table.c
@@ -0,0 +1,808 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __curtable_open_indices(WT_CURSOR_TABLE *ctable);
+static int __curtable_update(WT_CURSOR *cursor);
+
+#define APPLY_CG(ctable, f) do { \
+ WT_CURSOR **__cp; \
+ u_int __i; \
+ for (__i = 0, __cp = ctable->cg_cursors; \
+ __i < WT_COLGROUPS(ctable->table); \
+ __i++, __cp++) \
+ WT_TRET((*__cp)->f(*__cp)); \
+} while (0)
+
+#define APPLY_IDX(ctable, f) do { \
+ WT_INDEX *idx; \
+ WT_CURSOR **__cp; \
+ u_int __i; \
+ __cp = (ctable)->idx_cursors; \
+ for (__i = 0; __i < ctable->table->nindices; __i++, __cp++) { \
+ idx = ctable->table->indices[__i]; \
+ WT_ERR(__wt_schema_project_merge(session, \
+ ctable->cg_cursors, \
+ idx->key_plan, idx->key_format, &(*__cp)->key)); \
+ F_SET(*__cp, WT_CURSTD_KEY_EXT | \
+ WT_CURSTD_VALUE_EXT); \
+ WT_ERR((*__cp)->f(*__cp)); \
+ WT_ERR((*__cp)->reset(*__cp)); \
+ } \
+} while (0)
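+
+/*
+ * For example, APPLY_CG(ctable, reset) expands to a loop invoking reset on
+ * every column-group cursor, accumulating any failure in ret through
+ * WT_TRET; APPLY_IDX additionally projects the current row into each index
+ * cursor's key before invoking the method, then resets the index cursor.
+ */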
+
+/*
+ * __wt_curtable_get_key --
+ * WT_CURSOR->get_key implementation for tables.
+ */
+int
+__wt_curtable_get_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *primary;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ va_list ap;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ primary = *ctable->cg_cursors;
+
+ va_start(ap, cursor);
+ ret = __wt_cursor_get_keyv(primary, cursor->flags, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_curtable_get_value --
+ * WT_CURSOR->get_value implementation for tables.
+ */
+int
+__wt_curtable_get_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR *primary;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ primary = *ctable->cg_cursors;
+ CURSOR_API_CALL(cursor, session, get_value, NULL);
+ WT_CURSOR_NEEDVALUE(primary);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) {
+ ret = __wt_schema_project_merge(session,
+ ctable->cg_cursors, ctable->plan,
+ cursor->value_format, &cursor->value);
+ if (ret == 0) {
+ item = va_arg(ap, WT_ITEM *);
+ item->data = cursor->value.data;
+ item->size = cursor->value.size;
+ }
+ } else
+ ret = __wt_schema_project_out(session,
+ ctable->cg_cursors, ctable->plan, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_curtable_set_key --
+ * WT_CURSOR->set_key implementation for tables.
+ */
+void
+__wt_curtable_set_key(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR **cp, *primary;
+ WT_CURSOR_TABLE *ctable;
+ va_list ap;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ cp = ctable->cg_cursors;
+ primary = *cp++;
+
+ va_start(ap, cursor);
+ __wt_cursor_set_keyv(primary, cursor->flags, ap);
+ va_end(ap);
+
+ if (!F_ISSET(primary, WT_CURSTD_KEY_SET))
+ return;
+
+ /* Copy the primary key to the other cursors. */
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->recno = primary->recno;
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ }
+}
+
+/*
+ * __wt_curtable_set_value --
+ * WT_CURSOR->set_value implementation for tables.
+ */
+void
+__wt_curtable_set_value(WT_CURSOR *cursor, ...)
+{
+ WT_CURSOR **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_ITEM *item;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, set_value, NULL);
+
+ va_start(ap, cursor);
+ if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON)) {
+ item = va_arg(ap, WT_ITEM *);
+ cursor->value.data = item->data;
+ cursor->value.size = item->size;
+ ret = __wt_schema_project_slice(session,
+ ctable->cg_cursors, ctable->plan, 0,
+ cursor->value_format, &cursor->value);
+ } else
+ ret = __wt_schema_project_in(session,
+ ctable->cg_cursors, ctable->plan, ap);
+ va_end(ap);
+
+ for (i = 0, cp = ctable->cg_cursors;
+ i < WT_COLGROUPS(ctable->table); i++, cp++)
+ if (ret == 0)
+ F_SET(*cp, WT_CURSTD_VALUE_EXT);
+ else {
+ (*cp)->saved_err = ret;
+ F_CLR(*cp, WT_CURSTD_VALUE_SET);
+ }
+
+err: API_END(session, ret);
+}
+
+/*
+ * __curtable_compare --
+ * WT_CURSOR->compare implementation for tables.
+ */
+static int
+__curtable_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * call the underlying object's comparison routine.
+ */
+ if (strcmp(a->internal_uri, b->internal_uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "comparison method cursors must reference the same object");
+ WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(a));
+ WT_CURSOR_CHECKKEY(WT_CURSOR_PRIMARY(b));
+
+ ret = WT_CURSOR_PRIMARY(a)->compare(
+ WT_CURSOR_PRIMARY(a), WT_CURSOR_PRIMARY(b), cmpp);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_next --
+ * WT_CURSOR->next method for the table cursor type.
+ */
+static int
+__curtable_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ APPLY_CG(ctable, next);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_next_random --
+ * WT_CURSOR->next method for the table cursor type when configured with
+ * next_random.
+ */
+static int
+__curtable_next_random(WT_CURSOR *cursor)
+{
+ WT_CURSOR *primary, **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ cp = ctable->cg_cursors;
+
+	/* Split out the first next; it retrieves the random record. */
+ primary = *cp++;
+ WT_ERR(primary->next(primary));
+
+ /* Fill in the rest of the columns. */
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ (*cp)->recno = primary->recno;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_ERR((*cp)->search(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_prev --
+ * WT_CURSOR->prev method for the table cursor type.
+ */
+static int
+__curtable_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ APPLY_CG(ctable, prev);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_reset --
+ * WT_CURSOR->reset method for the table cursor type.
+ */
+static int
+__curtable_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ APPLY_CG(ctable, reset);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search --
+ * WT_CURSOR->search method for the table cursor type.
+ */
+static int
+__curtable_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, search, NULL);
+ APPLY_CG(ctable, search);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_search_near --
+ * WT_CURSOR->search_near method for the table cursor type.
+ */
+static int
+__curtable_search_near(WT_CURSOR *cursor, int *exact)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_CURSOR *primary, **cp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ cp = ctable->cg_cursors;
+ primary = *cp;
+ WT_ERR(primary->search_near(primary, exact));
+
+	for (i = 1, ++cp; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->key.data = primary->key.data;
+ (*cp)->key.size = primary->key.size;
+ (*cp)->recno = primary->recno;
+ F_SET(*cp, WT_CURSTD_KEY_EXT);
+ WT_ERR((*cp)->search(*cp));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_insert --
+ * WT_CURSOR->insert method for the table cursor type.
+ */
+static int
+__curtable_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR *primary, **cp;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint32_t flag_orig;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+ /*
+	 * Split out the first insert; it may be allocating a recno.
+ *
+ * If the table has indices, we also need to know whether this record
+ * is replacing an existing record so that the existing index entries
+ * can be removed. We discover if this is an overwrite by configuring
+ * the primary cursor for no-overwrite, and checking if the insert
+ * detects a duplicate key.
+ */
+ cp = ctable->cg_cursors;
+ primary = *cp++;
+
+ flag_orig = F_ISSET(primary, WT_CURSTD_OVERWRITE);
+ if (ctable->table->nindices > 0)
+ F_CLR(primary, WT_CURSTD_OVERWRITE);
+ ret = primary->insert(primary);
+ F_SET(primary, flag_orig);
+
+ if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
+ /*
+ * !!!
+ * The insert failure clears these flags, but does not touch the
+ * items. We could make a copy each time for overwrite cursors,
+ * but for now we just reset the flags.
+ */
+ F_SET(primary, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
+ ret = __curtable_update(cursor);
+ goto err;
+ }
+ WT_ERR(ret);
+
+ for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) {
+ (*cp)->recno = primary->recno;
+ WT_ERR((*cp)->insert(*cp));
+ }
+
+ APPLY_IDX(ctable, insert);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curtable_update --
+ * WT_CURSOR->update method for the table cursor type.
+ */
+static int
+__curtable_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+ /*
+ * If the table has indices, first delete any old index keys, then
+ * update the primary, then insert the new index keys. This is
+ * complicated by the fact that we need the old value to generate the
+ * old index keys, so we make a temporary copy of the new value.
+ */
+ if (ctable->table->nindices > 0) {
+ WT_ERR(__wt_schema_project_merge(session,
+ ctable->cg_cursors, ctable->plan,
+ cursor->value_format, &cursor->value));
+ APPLY_CG(ctable, search);
+ /*
+ * Remove only if the key exists.
+ */
+ if (ret == 0) {
+ APPLY_IDX(ctable, remove);
+ WT_ERR(__wt_schema_project_slice(session,
+ ctable->cg_cursors, ctable->plan, 0,
+ cursor->value_format, &cursor->value));
+ } else if (ret == WT_NOTFOUND)
+ ret = 0;
+ else
+ WT_ERR(ret);
+ }
+ APPLY_CG(ctable, update);
+ WT_ERR(ret);
+ if (ctable->idx_cursors != NULL)
+ APPLY_IDX(ctable, insert);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __curtable_remove --
+ * WT_CURSOR->remove method for the table cursor type.
+ */
+static int
+__curtable_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ WT_ERR(__curtable_open_indices(ctable));
+
+	/* Find the old record so it can be removed from indices. */
+ if (ctable->table->nindices > 0) {
+ APPLY_CG(ctable, search);
+ WT_ERR(ret);
+ APPLY_IDX(ctable, remove);
+ }
+
+ APPLY_CG(ctable, remove);
+
+err: CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __wt_table_range_truncate --
+ * Truncate of a cursor range, table implementation.
+ */
+int
+__wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop)
+{
+ WT_CURSOR *wt_start, *wt_stop;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_ITEM raw;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp;
+
+ ctable = (start != NULL) ? start : stop;
+ session = (WT_SESSION_IMPL *)ctable->iface.session;
+ wt_start = &start->iface;
+ wt_stop = &stop->iface;
+
+ /* Open any indices. */
+ WT_RET(__curtable_open_indices(ctable));
+ WT_RET(__wt_scr_alloc(session, 128, &key));
+
+ /*
+ * Step through the cursor range, removing the index entries.
+ *
+ * If there are indices, copy the key we're using to step through the
+ * cursor range (so we can reset the cursor to its original position),
+ * then remove all of the index records in the truncated range. Copy
+ * the raw key because the memory is only valid until the cursor moves.
+ */
+ if (ctable->table->nindices > 0) {
+ if (start == NULL) {
+ WT_ERR(__wt_cursor_get_raw_key(wt_stop, &raw));
+ WT_ERR(__wt_buf_set(session, key, raw.data, raw.size));
+
+ do {
+ APPLY_CG(stop, search);
+ WT_ERR(ret);
+ APPLY_IDX(stop, remove);
+ } while ((ret = wt_stop->prev(wt_stop)) == 0);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ __wt_cursor_set_raw_key(wt_stop, key);
+ APPLY_CG(stop, search);
+ } else {
+ WT_ERR(__wt_cursor_get_raw_key(wt_start, &raw));
+ WT_ERR(__wt_buf_set(session, key, raw.data, raw.size));
+
+ cmp = -1;
+ do {
+ APPLY_CG(start, search);
+ WT_ERR(ret);
+ APPLY_IDX(start, remove);
+ if (stop != NULL)
+ WT_ERR(wt_start->compare(
+ wt_start, wt_stop,
+ &cmp));
+ } while (cmp < 0 &&
+ (ret = wt_start->next(wt_start)) == 0);
+ WT_ERR_NOTFOUND_OK(ret);
+
+ __wt_cursor_set_raw_key(wt_start, key);
+ APPLY_CG(start, search);
+ }
+ }
+
+ /* Truncate the column groups. */
+ for (i = 0; i < WT_COLGROUPS(ctable->table); i++)
+ WT_ERR(__wt_range_truncate(
+ (start == NULL) ? NULL : start->cg_cursors[i],
+ (stop == NULL) ? NULL : stop->cg_cursors[i]));
+
+err: __wt_scr_free(&key);
+ return (ret);
+}
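+
+/*
+ * Illustrative sketch, not part of the original change: range truncation
+ * is normally driven through WT_SESSION::truncate, passing a pair of
+ * positioned cursors bounding the range (either bound may be NULL). The
+ * WT_EXAMPLE_USAGE guard is hypothetical.
+ */
+#ifdef WT_EXAMPLE_USAGE
+static int
+example_truncate_range(
+    WT_SESSION *wt_session, WT_CURSOR *start, WT_CURSOR *stop)
+{
+	/* Both cursors must be positioned within the same table. */
+	return (wt_session->truncate(wt_session, NULL, start, stop, NULL));
+}
+#endif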
+
+/*
+ * __curtable_close --
+ * WT_CURSOR->close method for the table cursor type.
+ */
+static int
+__curtable_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_TABLE *ctable;
+ WT_CURSOR **cp;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ ctable = (WT_CURSOR_TABLE *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+
+ for (i = 0, cp = ctable->cg_cursors;
+ i < WT_COLGROUPS(ctable->table); i++, cp++)
+ if (*cp != NULL) {
+ WT_TRET((*cp)->close(*cp));
+ *cp = NULL;
+ }
+
+ if (ctable->idx_cursors != NULL)
+ for (i = 0, cp = ctable->idx_cursors;
+ i < ctable->table->nindices; i++, cp++)
+ if (*cp != NULL) {
+ WT_TRET((*cp)->close(*cp));
+ *cp = NULL;
+ }
+
+ if (ctable->plan != ctable->table->plan)
+ __wt_free(session, ctable->plan);
+ for (i = 0; ctable->cfg[i] != NULL; ++i)
+ __wt_free(session, ctable->cfg[i]);
+ __wt_free(session, ctable->cfg);
+ if (cursor->value_format != ctable->table->value_format)
+ __wt_free(session, cursor->value_format);
+ __wt_free(session, ctable->cg_cursors);
+ __wt_free(session, ctable->idx_cursors);
+ __wt_schema_release_table(session, ctable->table);
+ /* The URI is owned by the table. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curtable_open_colgroups --
+ * Open cursors on column groups for a table cursor.
+ */
+static int
+__curtable_open_colgroups(WT_CURSOR_TABLE *ctable, const char *cfg_arg[])
+{
+ WT_SESSION_IMPL *session;
+ WT_TABLE *table;
+ WT_CURSOR **cp;
+ /*
+ * Underlying column groups are always opened without dump, and only
+ * the primary is opened with next_random.
+ */
+ const char *cfg[] = {
+ cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL, NULL
+ };
+ u_int i;
+
+ session = (WT_SESSION_IMPL *)ctable->iface.session;
+ table = ctable->table;
+
+ if (!table->cg_complete)
+ WT_RET_MSG(session, EINVAL,
+ "Can't use '%s' until all column groups are created",
+ table->name);
+
+ WT_RET(__wt_calloc_def(session,
+ WT_COLGROUPS(table), &ctable->cg_cursors));
+
+ for (i = 0, cp = ctable->cg_cursors;
+ i < WT_COLGROUPS(table);
+ i++, cp++) {
+ WT_RET(__wt_open_cursor(session, table->cgroups[i]->source,
+ &ctable->iface, cfg, cp));
+ cfg[3] = "next_random=false";
+ }
+ return (0);
+}
+
+/*
+ * __curtable_open_indices --
+ * Open cursors on indices for a table cursor.
+ */
+static int
+__curtable_open_indices(WT_CURSOR_TABLE *ctable)
+{
+ WT_CURSOR **cp, *primary;
+ WT_SESSION_IMPL *session;
+ WT_TABLE *table;
+ u_int i;
+
+ session = (WT_SESSION_IMPL *)ctable->iface.session;
+ table = ctable->table;
+
+ WT_RET(__wt_schema_open_indices(session, table));
+ if (table->nindices == 0 || ctable->idx_cursors != NULL)
+ return (0);
+
+ /* Check for bulk cursors. */
+ primary = *ctable->cg_cursors;
+ if (F_ISSET(primary, WT_CURSTD_BULK))
+ WT_RET_MSG(session, ENOTSUP,
+ "Bulk load is not supported for tables with indices");
+
+ WT_RET(__wt_calloc_def(session, table->nindices, &ctable->idx_cursors));
+ for (i = 0, cp = ctable->idx_cursors; i < table->nindices; i++, cp++)
+ WT_RET(__wt_open_cursor(session, table->indices[i]->source,
+ &ctable->iface, ctable->cfg, cp));
+ return (0);
+}
+
+/*
+ * __wt_curtable_open --
+ * WT_SESSION->open_cursor method for table cursors.
+ */
+int
+__wt_curtable_open(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_curtable_get_key, /* get-key */
+ __wt_curtable_get_value, /* get-value */
+ __wt_curtable_set_key, /* set-key */
+ __wt_curtable_set_value, /* set-value */
+ __curtable_compare, /* compare */
+ __curtable_next, /* next */
+ __curtable_prev, /* prev */
+ __curtable_reset, /* reset */
+ __curtable_search, /* search */
+ __curtable_search_near, /* search-near */
+ __curtable_insert, /* insert */
+ __curtable_update, /* update */
+ __curtable_remove, /* remove */
+ __curtable_close); /* close */
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR *cursor;
+ WT_CURSOR_TABLE *ctable;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_TABLE *table;
+ size_t size;
+ int cfg_cnt;
+ const char *tablename, *columns;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_TABLE, iface) == 0);
+
+ ctable = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "table:"))
+ return (EINVAL);
+ columns = strchr(tablename, '(');
+ if (columns == NULL)
+ size = strlen(tablename);
+ else
+ size = WT_PTRDIFF(columns, tablename);
+ WT_RET(__wt_schema_get_table(session, tablename, size, 0, &table));
+
+ if (table->is_simple) {
+ /* Just return a cursor on the underlying data source. */
+ ret = __wt_open_cursor(session,
+ table->cgroups[0]->source, NULL, cfg, cursorp);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+ }
+
+ WT_RET(__wt_calloc_def(session, 1, &ctable));
+
+ cursor = &ctable->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->internal_uri = table->name;
+ cursor->key_format = table->key_format;
+ cursor->value_format = table->value_format;
+
+ ctable->table = table;
+ ctable->plan = table->plan;
+
+ /* Handle projections. */
+ if (columns != NULL) {
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_struct_reformat(session, table,
+ columns, strlen(columns), NULL, 1, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &cursor->value_format));
+
+ WT_ERR(__wt_buf_init(session, tmp, 0));
+ WT_ERR(__wt_struct_plan(session, table,
+ columns, strlen(columns), 0, tmp));
+ WT_ERR(__wt_strndup(
+ session, tmp->data, tmp->size, &ctable->plan));
+ }
+
+ /*
+ * random_retrieval
+ * Random retrieval cursors only support next, reset and close.
+ */
+ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
+ if (cval.val != 0) {
+ __wt_cursor_set_notsup(cursor);
+ cursor->next = __curtable_next_random;
+ cursor->reset = __curtable_reset;
+ }
+
+ WT_ERR(__wt_cursor_init(
+ cursor, cursor->internal_uri, NULL, cfg, cursorp));
+
+ if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
+ WT_ERR(__wt_json_column_init(cursor, table->key_format,
+ NULL, &table->colconf));
+
+ /*
+ * Open the colgroup cursors immediately: we're going to need them for
+ * any operation. We defer opening index cursors until we need them
+ * for an update. Note that this must come after the call to
+ * __wt_cursor_init: the table cursor must already be on the list of
+ * session cursors or we can't work out where to put the colgroup
+ * cursor(s).
+ */
+ WT_ERR(__curtable_open_colgroups(ctable, cfg));
+
+ /*
+	 * We'll need to squirrel away a copy of the cursor configuration
+	 * in case we open indices later.
+ *
+ * cfg[0] is the baseline configuration for the cursor open and we can
+ * acquire another copy from the configuration structures, so it would
+	 * be reasonable not to copy it here, but I'd rather be safe than sorry.
+ *
+ * Underlying indices are always opened without dump.
+ */
+ for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+ ;
+ WT_ERR(__wt_calloc_def(session, cfg_cnt + 2, &ctable->cfg));
+ for (cfg_cnt = 0; cfg[cfg_cnt] != NULL; ++cfg_cnt)
+ WT_ERR(
+ __wt_strdup(session, cfg[cfg_cnt], &ctable->cfg[cfg_cnt]));
+ WT_ERR(__wt_strdup(session, "dump=\"\"", &ctable->cfg[cfg_cnt]));
+
+ if (0) {
+err: WT_TRET(__curtable_close(cursor));
+ *cursorp = NULL;
+ }
+
+ __wt_scr_free(&tmp);
+ return (ret);
+}
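+
+/*
+ * Illustrative usage (a sketch, not part of this change): a projection
+ * cursor is requested by appending a parenthesized column list to the
+ * table URI, which is what the strchr(tablename, '(') check above parses.
+ * An application might open one as:
+ *
+ *	WT_CURSOR *c;
+ *	ret = session->open_cursor(
+ *	    session, "table:mytable(name,address)", NULL, NULL, &c);
+ *
+ * The table and column names here are hypothetical.
+ */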
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
new file mode 100644
index 00000000000..e358d22b278
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Standard entry points to the API: declares/initializes local variables. */
+#define API_SESSION_INIT(s, h, n, cur, dh) \
+ WT_DATA_HANDLE *__olddh = (s)->dhandle; \
+ const char *__oldname = (s)->name; \
+ (s)->cursor = (cur); \
+ (s)->dhandle = (dh); \
+	(s)->name = (s)->lastop = #h "." #n;
+
+#define API_CALL_NOCONF(s, h, n, cur, dh) do { \
+ API_SESSION_INIT(s, h, n, cur, dh); \
+ WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0); \
+ WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n))
+
+#define API_CALL(s, h, n, cur, dh, config, cfg) do { \
+ const char *cfg[] = \
+ { WT_CONFIG_BASE(s, h##_##n), config, NULL }; \
+ API_SESSION_INIT(s, h, n, cur, dh); \
+ WT_ERR(F_ISSET(S2C(s), WT_CONN_PANIC) ? __wt_panic(s) : 0); \
+ WT_ERR(((config) != NULL) ? \
+ __wt_config_check((s), \
+ WT_CONFIG_REF(session, h##_##n), (config), 0) : 0); \
+ WT_ERR(__wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n))
+
+#define API_END(s, ret) \
+ if ((s) != NULL) { \
+ (s)->dhandle = __olddh; \
+ (s)->name = __oldname; \
+ if (F_ISSET(&(s)->txn, TXN_RUNNING) && \
+ (ret) != 0 && \
+ (ret) != WT_NOTFOUND && \
+ (ret) != WT_DUPLICATE_KEY) \
+ F_SET(&(s)->txn, TXN_ERROR); \
+ } \
+} while (0)
+
+/* An API call wrapped in a transaction if necessary. */
+#define TXN_API_CALL(s, h, n, cur, bt, config, cfg) do { \
+ int __autotxn = 0; \
+	API_CALL(s, h, n, cur, bt, config, cfg);			\
+ __autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING); \
+ if (__autotxn) \
+ F_SET(&(s)->txn, TXN_AUTOCOMMIT)
+
+/* An API call with no configuration, wrapped in a transaction if necessary. */
+#define TXN_API_CALL_NOCONF(s, h, n, cur, bt) do { \
+ int __autotxn = 0; \
+ API_CALL_NOCONF(s, h, n, cur, bt); \
+ __autotxn = !F_ISSET(&(s)->txn, TXN_AUTOCOMMIT | TXN_RUNNING); \
+ if (__autotxn) \
+ F_SET(&(s)->txn, TXN_AUTOCOMMIT)
+
+/* End a transactional API call, optional retry on deadlock. */
+#define TXN_API_END_RETRY(s, ret, retry) \
+ API_END(s, ret); \
+ if (__autotxn) { \
+ if (F_ISSET(&(s)->txn, TXN_AUTOCOMMIT)) \
+ F_CLR(&(s)->txn, TXN_AUTOCOMMIT); \
+ else if (ret == 0 && !F_ISSET(&(s)->txn, TXN_ERROR)) \
+ ret = __wt_txn_commit((s), NULL); \
+ else { \
+ WT_TRET(__wt_txn_rollback((s), NULL)); \
+ if ((ret == 0 || ret == WT_ROLLBACK) && \
+ (retry)) { \
+ ret = 0; \
+ continue; \
+ } \
+ WT_TRET(__wt_session_reset_cursors(s)); \
+ } \
+ } \
+ break; \
+} while (ret == 0)
+
+/* End a transactional API call, retry on deadlock. */
+#define TXN_API_END(s, ret) TXN_API_END_RETRY(s, ret, 1)
+
+/*
+ * In almost all cases, API_END is followed by an immediate return, so
+ * make that simple.  If a session or connection method is about to
+ * return WT_NOTFOUND (some underlying object was not found), map it to
+ * ENOENT: only cursor methods return WT_NOTFOUND.
+ */
+#define API_END_RET(s, ret) \
+ API_END(s, ret); \
+ return (ret)
+#define API_END_RET_NOTFOUND_MAP(s, ret) \
+ API_END(s, ret); \
+ return ((ret) == WT_NOTFOUND ? ENOENT : (ret))
+
+#define CONNECTION_API_CALL(conn, s, n, config, cfg) \
+ s = (conn)->default_session; \
+ API_CALL(s, connection, n, NULL, NULL, config, cfg)
+
+#define CONNECTION_API_CALL_NOCONF(conn, s, n) \
+ s = (conn)->default_session; \
+ API_CALL_NOCONF(s, connection, n, NULL, NULL)
+
+#define SESSION_API_CALL(s, n, config, cfg) \
+ API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+#define SESSION_API_CALL_NOCONF(s, n) \
+ API_CALL_NOCONF(s, session, n, NULL, NULL)
+
+#define SESSION_TXN_API_CALL(s, n, config, cfg) \
+ TXN_API_CALL(s, session, n, NULL, NULL, config, cfg)
+
+#define CURSOR_API_CALL(cur, s, n, bt) \
+ (s) = (WT_SESSION_IMPL *)(cur)->session; \
+ API_CALL_NOCONF(s, cursor, n, cur, \
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+#define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \
+ (s) = (WT_SESSION_IMPL *)(cur)->session; \
+ TXN_API_CALL_NOCONF(s, cursor, n, cur, \
+ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle)
+
+#define CURSOR_UPDATE_API_END(s, ret) \
+ TXN_API_END(s, ret)
+
+#define ASYNCOP_API_CALL(conn, s, n) \
+ s = (conn)->default_session; \
+ API_CALL_NOCONF(s, asyncop, n, NULL, NULL)
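+
+/*
+ * Illustrative expansion (a sketch, assuming a hypothetical session method
+ * named "example"): a method body brackets its work with a call/end pair,
+ * picking up configuration checking, panic checks and error mapping:
+ *
+ *	WT_DECL_RET;
+ *	SESSION_API_CALL(session, example, config, cfg);
+ *	... the method body, jumping to err on failure ...
+ * err:	API_END_RET_NOTFOUND_MAP(session, ret);
+ */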
diff --git a/src/third_party/wiredtiger/src/include/async.h b/src/third_party/wiredtiger/src/include/async.h
new file mode 100644
index 00000000000..8565874c2f3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/async.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+typedef enum {
+ WT_ASYNCOP_ENQUEUED, /* Placed on the work queue */
+ WT_ASYNCOP_FREE, /* Able to be allocated to user */
+ WT_ASYNCOP_READY, /* Allocated and ready for user to use */
+ WT_ASYNCOP_WORKING /* Operation in progress by worker */
+} WT_ASYNC_STATE;
+
+typedef enum {
+ WT_ASYNC_FLUSH_NONE=0, /* No flush in progress */
+ WT_ASYNC_FLUSH_COMPLETE, /* Notify flush caller it's done */
+ WT_ASYNC_FLUSH_IN_PROGRESS, /* Prevent other callers */
+ WT_ASYNC_FLUSHING /* Notify workers */
+} WT_ASYNC_FLUSH_STATE;
+
+#define MAX_ASYNC_SLEEP_USECS 100000 /* Maximum sleep waiting for work */
+#define MAX_ASYNC_YIELD 200 /* Maximum number of yields for work */
+
+#define O2C(op) ((WT_CONNECTION_IMPL *)(op)->iface.connection)
+#define O2S(op) \
+ (((WT_CONNECTION_IMPL *)(op)->iface.connection)->default_session)
+/*
+ * WT_ASYNC_FORMAT --
+ * The URI/config/format cache.
+ */
+struct __wt_async_format {
+ STAILQ_ENTRY(__wt_async_format) q;
+ const char *config;
+ uint64_t cfg_hash; /* Config hash */
+ const char *uri;
+ uint64_t uri_hash; /* URI hash */
+ const char *key_format;
+ const char *value_format;
+};
+
+/*
+ * WT_ASYNC_OP_IMPL --
+ * Implementation of the WT_ASYNC_OP.
+ */
+struct __wt_async_op_impl {
+ WT_ASYNC_OP iface;
+
+ WT_ASYNC_CALLBACK *cb;
+
+ uint32_t internal_id; /* Array position id. */
+ uint64_t unique_id; /* Unique identifier. */
+
+ WT_ASYNC_FORMAT *format; /* Format structure */
+ WT_ASYNC_STATE state; /* Op state */
+ WT_ASYNC_OPTYPE optype; /* Operation type */
+};
+
+/*
+ * Definition of the async subsystem.
+ */
+struct __wt_async {
+ /*
+ * Ops array protected by the ops_lock.
+ */
+ WT_SPINLOCK ops_lock; /* Locked: ops array */
+ WT_ASYNC_OP_IMPL *async_ops; /* Async ops */
+#define OPS_INVALID_INDEX 0xffffffff
+ uint32_t ops_index; /* Active slot index */
+ uint64_t op_id; /* Unique ID counter */
+ WT_ASYNC_OP_IMPL **async_queue; /* Async ops work queue */
+ uint32_t async_qsize; /* Async work queue size */
+ /*
+	 * We need two head values and two tail values.  All but one are
+	 * maintained as ever-increasing values to ease wraparound.
+	 *
+	 * alloc_head: the next slot to allocate for producers.
+	 * head: the current head visible to consumers.
+	 *	head is always <= alloc_head.
+	 * alloc_tail: the next slot for consumers to dequeue.
+	 *	alloc_tail is always <= head.
+	 * tail_slot: the last slot consumed.
+	 *	A producer may need to wait for tail_slot to advance.
+ */
+ uint64_t alloc_head; /* Next slot to enqueue */
+ uint64_t head; /* Head visible to worker */
+ uint64_t alloc_tail; /* Next slot to dequeue */
+ uint64_t tail_slot; /* Worker slot consumed */
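+
+	/*
+	 * Illustrative arithmetic (a sketch, not part of the original):
+	 * the ever-increasing values satisfy
+	 *	alloc_tail <= head <= alloc_head
+	 * and map to queue slots as, for example,
+	 *	slot = alloc_head % async_qsize
+	 * (the exact mapping in the implementation may differ); tail_slot
+	 * tracks a slot itself, which is why it is the one value that is
+	 * not ever-increasing.
+	 */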
+
+ STAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
+ int cur_queue; /* Currently enqueued */
+ int max_queue; /* Maximum enqueued */
+ WT_ASYNC_FLUSH_STATE flush_state; /* Queue flush state */
+ /* Notify any waiting threads when flushing is done. */
+ WT_CONDVAR *flush_cond;
+ WT_ASYNC_OP_IMPL flush_op; /* Special flush op */
+ uint32_t flush_count; /* Worker count */
+ uint64_t flush_gen; /* Flush generation number */
+
+#define WT_ASYNC_MAX_WORKERS 20
+ WT_SESSION_IMPL *worker_sessions[WT_ASYNC_MAX_WORKERS];
+ /* Async worker threads */
+ wt_thread_t worker_tids[WT_ASYNC_MAX_WORKERS];
+
+ uint32_t flags; /* Currently unused. */
+};
+
+/*
+ * WT_ASYNC_CURSOR --
+ * Async container for a cursor. Each async worker thread
+ * has a cache of async cursors to reuse for operations.
+ */
+struct __wt_async_cursor {
+ STAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */
+ uint64_t cfg_hash; /* Config hash */
+ uint64_t uri_hash; /* URI hash */
+ WT_CURSOR *c; /* WT cursor */
+};
+
+/*
+ * WT_ASYNC_WORKER_STATE --
+ * State for an async worker thread.
+ */
+struct __wt_async_worker_state {
+ uint32_t id;
+ STAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh;
+ uint32_t num_cursors;
+};
diff --git a/src/third_party/wiredtiger/src/include/bitstring.i b/src/third_party/wiredtiger/src/include/bitstring.i
new file mode 100644
index 00000000000..95af6731bf9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/bitstring.i
@@ -0,0 +1,316 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Paul Vixie.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/sys/bitstring.h,v 1.5 2005/01/07 02:29:23 imp Exp $
+ */
+
+	/* byte of the bitstring the bit is in */
+#define __bit_byte(bit) ((bit) >> 3)
+
+ /* mask for the bit within its byte */
+#define __bit_mask(bit) (1 << ((bit) & 0x7))
+
+	/* bytes in a bitstring of nbits */
+#define __bitstr_size(nbits) (((nbits) + 7) >> 3)
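+
+/*
+ * Worked example (illustrative): for bit 11, __bit_byte(11) is 11 >> 3 = 1
+ * (the second byte) and __bit_mask(11) is 1 << (11 & 0x7) = 0x08; a
+ * bitstring of 11 bits needs __bitstr_size(11) = (11 + 7) >> 3 = 2 bytes.
+ */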
+
+/*
+ * __bit_alloc --
+ * Allocate a bitstring.
+ */
+static inline int
+__bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
+{
+ return (__wt_calloc(
+ session, (size_t)__bitstr_size(nbits), sizeof(uint8_t), retp));
+}
+
+/*
+ * __bit_test --
+ *	Test one bit in the bitstring.
+ */
+static inline int
+__bit_test(uint8_t *bitf, uint64_t bit)
+{
+ return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0);
+}
+
+/*
+ * __bit_set --
+ *	Set one bit in the bitstring.
+ */
+static inline void
+__bit_set(uint8_t *bitf, uint64_t bit)
+{
+ bitf[__bit_byte(bit)] |= __bit_mask(bit);
+}
+
+/*
+ * __bit_clear --
+ *	Clear one bit in the bitstring.
+ */
+static inline void
+__bit_clear(uint8_t *bitf, uint64_t bit)
+{
+ bitf[__bit_byte(bit)] &= ~__bit_mask(bit);
+}
+
+/*
+ * __bit_nclr --
+ *	Clear bits start-to-stop in the bitstring.
+ */
+static inline void
+__bit_nclr(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+ uint64_t startbyte, stopbyte;
+
+ startbyte = __bit_byte(start);
+ stopbyte = __bit_byte(stop);
+
+ if (startbyte == stopbyte)
+ bitf[startbyte] &=
+ ((0xff >> (8 - (start & 0x7))) |
+ (0xff << ((stop & 0x7) + 1)));
+ else {
+ bitf[startbyte] &= 0xff >> (8 - (start & 0x7));
+ while (++startbyte < stopbyte)
+ bitf[startbyte] = 0;
+ bitf[stopbyte] &= 0xff << ((stop & 0x7) + 1);
+ }
+}
+
+/*
+ * __bit_nset --
+ *	Set bits start-to-stop in the bitstring.
+ */
+static inline void
+__bit_nset(uint8_t *bitf, uint64_t start, uint64_t stop)
+{
+ uint64_t startbyte, stopbyte;
+
+ startbyte = __bit_byte(start);
+ stopbyte = __bit_byte(stop);
+ if (startbyte == stopbyte)
+ bitf[startbyte] |=
+ ((0xff << (start & 0x7)) & (0xff >> (7 - (stop & 0x7))));
+ else {
+ bitf[startbyte] |= 0xff << (start & 0x7);
+ while (++startbyte < stopbyte)
+ bitf[startbyte] = 0xff;
+ bitf[stopbyte] |= 0xff >> (7 - (stop & 0x7));
+ }
+}
+
+/*
+ * __bit_ffc --
+ *	Find first clear bit in the bitstring; return 0 on success, -1 if
+ * no bit is clear.
+ */
+static inline int
+__bit_ffc(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+ uint8_t lb;
+ uint64_t byte, stopbyte, value;
+
+ value = 0; /* -Wuninitialized */
+
+ if (nbits == 0)
+ return (-1);
+
+ for (byte = 0,
+ stopbyte = __bit_byte(nbits - 1); byte <= stopbyte; ++byte)
+ if (bitf[byte] != 0xff) {
+ value = byte << 3;
+ for (lb = bitf[byte]; lb & 0x01; ++value, lb >>= 1)
+ ;
+ break;
+ }
+
+ if (byte > stopbyte || value >= nbits)
+ return (-1);
+
+ *retp = value;
+ return (0);
+}
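+
+/*
+ * For example (illustrative): with bitf = { 0xff, 0x07 } and nbits = 16,
+ * byte 0 is fully set and byte 1 has bits 8-10 set, so __bit_ffc returns
+ * 0 and sets *retp to 11, the first clear bit.
+ */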
+
+/*
+ * __bit_ffs --
+ *	Find first set bit in the bitstring; return 0 on success, -1 if
+ * no bit is set.
+ */
+static inline int
+__bit_ffs(uint8_t *bitf, uint64_t nbits, uint64_t *retp)
+{
+ uint8_t lb;
+ uint64_t byte, stopbyte, value;
+
+ value = 0;
+ if (nbits == 0)
+ return (-1);
+
+ for (byte = 0,
+ stopbyte = __bit_byte(nbits - 1); byte <= stopbyte; ++byte)
+ if (bitf[byte] != 0) {
+ value = byte << 3;
+ for (lb = bitf[byte]; !(lb & 0x01); ++value, lb >>= 1)
+ ;
+ break;
+ }
+
+ if (byte > stopbyte || value >= nbits)
+ return (-1);
+
+ *retp = value;
+ return (0);
+}
+
+/*
+ * __bit_getv --
+ * Return a fixed-length column store bit-field value.
+ */
+static inline uint8_t
+__bit_getv(uint8_t *bitf, uint64_t entry, uint8_t width)
+{
+ uint8_t value;
+ uint64_t bit;
+
+#define __BIT_GET(len, mask) \
+ case len: \
+ if (__bit_test(bitf, bit)) \
+ value |= mask; \
+ ++bit \
+ /* FALLTHROUGH */
+
+ value = 0;
+ bit = entry * width;
+
+ /*
+ * Fast-path single bytes, do repeated tests for the rest: we could
+ * slice-and-dice instead, but the compiler is probably going to do
+ * a better job than I will.
+ */
+ switch (width) {
+ case 8:
+ return (bitf[__bit_byte(bit)]);
+ __BIT_GET(7, 0x40);
+ __BIT_GET(6, 0x20);
+ __BIT_GET(5, 0x10);
+ __BIT_GET(4, 0x08);
+ __BIT_GET(3, 0x04);
+ __BIT_GET(2, 0x02);
+ __BIT_GET(1, 0x01);
+ }
+ return (value);
+}
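+
+/*
+ * For example (illustrative): with width = 2, entry 5 starts at bit
+ * 5 * 2 = 10, so __bit_getv tests bits 10 and 11 against the 0x02 and
+ * 0x01 masks; a full 8-bit width short-circuits to a whole-byte read.
+ */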
+
+/*
+ * __bit_getv_recno --
+ * Return a record number's bit-field value.
+ */
+static inline uint8_t
+__bit_getv_recno(WT_PAGE *page, uint64_t recno, uint8_t width)
+{
+ return (__bit_getv(
+ page->pg_fix_bitf, recno - page->pg_fix_recno, width));
+}
+
+/*
+ * __bit_setv --
+ * Set a fixed-length column store bit-field value.
+ */
+static inline void
+__bit_setv(uint8_t *bitf, uint64_t entry, uint8_t width, uint8_t value)
+{
+ uint64_t bit;
+
+#define __BIT_SET(len, mask) \
+ case len: \
+ if (value & (mask)) \
+ __bit_set(bitf, bit); \
+ else \
+ __bit_clear(bitf, bit); \
+ ++bit \
+ /* FALLTHROUGH */
+
+ bit = entry * width;
+
+ /*
+ * Fast-path single bytes, do repeated tests for the rest: we could
+ * slice-and-dice instead, but the compiler is probably going to do
+ * a better job than I will.
+ */
+ switch (width) {
+ case 8:
+ bitf[__bit_byte(bit)] = value;
+ return;
+ __BIT_SET(7, 0x40);
+ __BIT_SET(6, 0x20);
+ __BIT_SET(5, 0x10);
+ __BIT_SET(4, 0x08);
+ __BIT_SET(3, 0x04);
+ __BIT_SET(2, 0x02);
+ __BIT_SET(1, 0x01);
+ }
+}
+
+/*
+ * __bit_setv_recno --
+ * Set a record number's bit-field value.
+ */
+static inline void
+__bit_setv_recno(WT_PAGE *page, uint64_t recno, uint8_t width, uint8_t value)
+{
+ __bit_setv(page->pg_fix_bitf, recno - page->pg_fix_recno, width, value);
+}
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
new file mode 100644
index 00000000000..10fa51243ac
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WiredTiger's block manager interface.
+ */
+
+/*
+ * The file's description is written into the first block of the file, which
+ * means we can use an offset of 0 as an invalid offset.
+ */
+#define WT_BLOCK_INVALID_OFFSET 0
+
+/*
+ * The block manager maintains three per-checkpoint extent lists:
+ * alloc: the extents allocated in this checkpoint
+ * avail: the extents available for allocation
+ * discard: the extents freed in this checkpoint
+ *
+ * An extent list is based on two skiplists: first, a by-offset list linking
+ * WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
+ * list linking WT_SIZE elements and sorted by chunk size (low-to-high).
+ *
+ * Additionally, each WT_SIZE element on the by-size list has a skiplist of
+ * its own, linking WT_EXT elements and sorted by file offset (low-to-high).
+ * This list has an entry for each extent of a particular size.
+ *
+ * The trickiness is that each individual WT_EXT element appears on two
+ * skiplists.
+ * In order to minimize allocation calls, we allocate a single array of WT_EXT
+ * pointers at the end of the WT_EXT structure, for both skiplists, and store
+ * the depth of the skiplist in the WT_EXT structure. The skiplist entries for
+ * the offset skiplist start at WT_EXT.next[0] and the entries for the size
+ * skiplist start at WT_EXT.next[WT_EXT.depth].
+ *
+ * One final complication: we only maintain the per-size skiplist for the avail
+ * list, the alloc and discard extent lists are not searched based on size.
+ */
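+
+/*
+ * Illustrative layout (a sketch): for a WT_EXT element ext of depth d, the
+ * offset skiplist entries are ext->next[0] through ext->next[d - 1] and the
+ * size skiplist entries are ext->next[d] through ext->next[2 * d - 1], which
+ * is why WT_EXT_FOREACH_OFF below indexes next[(skip)->depth].
+ */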
+
+/*
+ * WT_EXTLIST --
+ * An extent list.
+ */
+struct __wt_extlist {
+ char *name; /* Name */
+
+ uint64_t bytes; /* Byte count */
+ uint32_t entries; /* Entry count */
+
+ wt_off_t offset; /* Written extent offset */
+ uint32_t cksum, size; /* Written extent cksum, size */
+
+ int track_size; /* Maintain per-size skiplist */
+
+ WT_EXT *last; /* Cached last element */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */
+ WT_SIZE *sz[WT_SKIP_MAXDEPTH];
+};
+
+/*
+ * WT_EXT --
+ * Encapsulation of an extent, either allocated or freed within the
+ * checkpoint.
+ */
+struct __wt_ext {
+ wt_off_t off; /* Extent's file offset */
+	wt_off_t size; /* Extent's size */
+
+ uint8_t depth; /* Skip list depth */
+
+ /*
+ * Variable-length array, sized by the number of skiplist elements.
+ * The first depth array entries are the address skiplist elements,
+ * the second depth array entries are the size skiplist.
+ */
+ WT_EXT *next[0]; /* Offset, size skiplists */
+};
+
+/*
+ * WT_SIZE --
+ * Encapsulation of a block size skiplist entry.
+ */
+struct __wt_size {
+ wt_off_t size; /* Size */
+
+ uint8_t depth; /* Skip list depth */
+
+ WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
+
+ /*
+ * We don't use a variable-length array for the size skiplist, we want
+ * to be able to use any cached WT_SIZE structure as the head of a list,
+ * and we don't know the related WT_EXT structure's depth.
+ */
+ WT_SIZE *next[WT_SKIP_MAXDEPTH]; /* Size skiplist */
+};
+
+/*
+ * WT_EXT_FOREACH --
+ * Walk a block manager skiplist.
+ * WT_EXT_FOREACH_OFF --
+ * Walk a block manager skiplist where the WT_EXT.next entries are offset
+ * by the depth.
+ */
+#define WT_EXT_FOREACH(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[0])
+#define WT_EXT_FOREACH_OFF(skip, head) \
+ for ((skip) = (head)[0]; \
+ (skip) != NULL; (skip) = (skip)->next[(skip)->depth])
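+
+/*
+ * Illustrative usage (a sketch): walking an extent list el by offset:
+ *
+ *	WT_EXT *ext;
+ *	WT_EXT_FOREACH(ext, el->off)
+ *		printf("%" PRIdMAX ": %" PRIdMAX " bytes\n",
+ *		    (intmax_t)ext->off, (intmax_t)ext->size);
+ */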
+
+/*
+ * Checkpoint cookie: carries a version number as I don't want to rev the schema
+ * file version should the default block manager checkpoint format change.
+ *
+ * Version #1 checkpoint cookie format:
+ * [1] [root addr] [alloc addr] [avail addr] [discard addr]
+ * [file size] [checkpoint size] [write generation]
+ */
+#define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */
+#define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */
+struct __wt_block_ckpt {
+ uint8_t version; /* Version */
+
+ wt_off_t root_offset; /* The root */
+ uint32_t root_cksum, root_size;
+
+ WT_EXTLIST alloc; /* Extents allocated */
+ WT_EXTLIST avail; /* Extents available */
+ WT_EXTLIST discard; /* Extents discarded */
+
+ wt_off_t file_size; /* Checkpoint file size */
+ uint64_t ckpt_size; /* Checkpoint byte count */
+
+ WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */
+
+ /*
+	 * Checkpoint archive: the block manager may free a lot of memory
+	 * from the allocation and discard extent lists when a checkpoint
+	 * completes.  Put that work off until the checkpoint resolves;
+	 * that lets the upper btree layer continue eviction sooner.
+ */
+ WT_EXTLIST ckpt_alloc; /* Checkpoint archive */
+ WT_EXTLIST ckpt_discard; /* Checkpoint archive */
+};
+
+/*
+ * WT_BM --
+ * Block manager handle, references a single checkpoint in a file.
+ */
+struct __wt_bm {
+ /* Methods */
+ int (*addr_string)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*addr_valid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ u_int (*block_header)(WT_BM *);
+ int (*checkpoint)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, int);
+ int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
+ const uint8_t *, size_t, uint8_t *, size_t *, int);
+ int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *);
+ int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
+ int (*close)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*compact_page_skip)
+ (WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, int *);
+ int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, int *);
+ int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*read)
+ (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
+ int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_next)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, int *);
+ int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
+ int (*salvage_valid)
+ (WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, int);
+ int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
+ int (*sync)(WT_BM *, WT_SESSION_IMPL *, int);
+ int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
+ int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
+ int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *);
+	int (*write)(WT_BM *,
+ WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int);
+ int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
+
+ WT_BLOCK *block; /* Underlying file */
+
+ void *map; /* Mapped region */
+ size_t maplen;
+ void *mappingcookie;
+
+ /*
+	 * There's only a single block manager handle that can be written;
+	 * all others are checkpoints.
+ * others are checkpoints.
+ */
+ int is_live; /* The live system */
+};
+
+/*
+ * WT_BLOCK --
+ * Block manager handle, references a single file.
+ */
+struct __wt_block {
+ const char *name; /* Name */
+
+ /* A list of block manager handles, sharing a file descriptor. */
+ uint32_t ref; /* References */
+ WT_FH *fh; /* Backing file handle */
+ TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
+
+ /* Configuration information, set when the file is opened. */
+ int allocfirst; /* Allocation is first-fit */
+ int allocfirst_save; /* Allocation is first-fit, saved */
+ uint32_t allocsize; /* Allocation size */
+ size_t os_cache; /* System buffer cache flush max */
+ size_t os_cache_max;
+ size_t os_cache_dirty; /* System buffer cache write max */
+ size_t os_cache_dirty_max;
+
+ u_int block_header; /* Header length */
+
+ /*
+ * There is only a single checkpoint in a file that can be written. The
+ * information could logically live in the WT_BM structure, but then we
+ * would be re-creating it every time we opened a new checkpoint and I'd
+ * rather not do that. So, it's stored here, only accessed by one WT_BM
+ * handle.
+ */
+ WT_SPINLOCK live_lock; /* Live checkpoint lock */
+ WT_BLOCK_CKPT live; /* Live checkpoint */
+	int ckpt_inprogress; /* Live checkpoint in progress */
+
+ /* Salvage support */
+ wt_off_t slvg_off; /* Salvage file offset */
+
+ /* Verification support */
+ int verify; /* If performing verification */
+ wt_off_t verify_size; /* Checkpoint's file size */
+ WT_EXTLIST verify_alloc; /* Verification allocation list */
+ uint64_t frags; /* Maximum frags in the file */
+ uint8_t *fragfile; /* Per-file frag tracking list */
+ uint8_t *fragckpt; /* Per-checkpoint frag tracking list */
+};
+
+/*
+ * WT_BLOCK_DESC --
+ * The file's description.
+ */
+struct __wt_block_desc {
+#define WT_BLOCK_MAGIC 120897
+ uint32_t magic; /* 00-03: Magic number */
+#define WT_BLOCK_MAJOR_VERSION 1
+ uint16_t majorv; /* 04-05: Major version */
+#define WT_BLOCK_MINOR_VERSION 0
+ uint16_t minorv; /* 06-07: Minor version */
+
+ uint32_t cksum; /* 08-11: Description block checksum */
+
+ uint32_t unused; /* 12-15: Padding */
+};
+/*
+ * WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
+ * ensure the compiler hasn't inserted padding (padding wouldn't cause a
+ * failure, since we reserve the first allocation-size block of the file
+ * for this information, but it would be worth investigating, regardless).
+ */
+#define WT_BLOCK_DESC_SIZE 16
+
+/*
+ * WT_BLOCK_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
+ */
+struct __wt_block_header {
+ /*
+ * We write the page size in the on-disk page header because it makes
+ * salvage easier. (If we don't know the expected page length, we'd
+ * have to read increasingly larger chunks from the file until we find
+ * one that checksums, and that's going to be harsh given WiredTiger's
+ * potentially large page sizes.)
+ */
+ uint32_t disk_size; /* 00-03: on-disk page size */
+
+ /*
+ * Page checksums are stored in two places. First, the page checksum
+ * is written within the internal page that references it as part of
+ * the address cookie. This is done to improve the chances of detecting
+ * not only disk corruption but other bugs (for example, overwriting a
+ * page with another valid page image). Second, a page's checksum is
+ * stored in the disk header. This is for salvage, so salvage knows it
+ * has found a page that may be useful.
+ */
+ uint32_t cksum; /* 04-07: checksum */
+
+#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */
+ uint8_t flags; /* 08: flags */
+
+ /*
+ * End the structure with 3 bytes of padding: it wastes space, but it
+ * leaves the structure 32-bit aligned and having a few bytes to play
+ * with in the future can't hurt.
+ */
+ uint8_t unused[3]; /* 09-11: unused padding */
+};
+/*
+ * WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_BLOCK_HEADER_SIZE 12
+
+/*
+ * WT_BLOCK_HEADER_BYTE
+ * WT_BLOCK_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
+ */
+#define WT_BLOCK_HEADER_BYTE_SIZE \
+ (WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
+#define WT_BLOCK_HEADER_BYTE(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
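+
+/*
+ * Worked arithmetic (illustrative): with WT_PAGE_HEADER_SIZE of 28 (see
+ * btmem.h) and WT_BLOCK_HEADER_SIZE of 12, the first usable byte of a
+ * default block is at offset 28 + 12 = 40.
+ */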
+
+/*
+ * Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures.
+ * We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum
+ * and on-disk size to be immediately available without decompression. We use
+ * the on-disk size and checksum during salvage to figure out where the blocks
+ * are, and the in-memory size tells us how large a buffer we need to decompress
+ * the block. We could skip less than 64B, but a 64B boundary may offer better
+ * alignment for the underlying compression engine, and skipping 64B won't make
+ * a difference in terms of compression efficiency.
+ */
+#define WT_BLOCK_COMPRESS_SKIP 64
diff --git a/src/third_party/wiredtiger/src/include/bloom.h b/src/third_party/wiredtiger/src/include/bloom.h
new file mode 100644
index 00000000000..4ae6d96b935
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/bloom.h
@@ -0,0 +1,28 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+/*
+ * REFERENCES:
+ * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
+ * http://code.google.com/p/cityhash-c/
+ */
+
+struct __wt_bloom {
+ const char *uri;
+ char *config;
+ uint8_t *bitstring; /* For in memory representation. */
+ WT_SESSION_IMPL *session;
+ WT_CURSOR *c;
+
+ uint32_t k; /* The number of hash functions used. */
+ uint32_t factor; /* The number of bits per item inserted. */
+ uint64_t m; /* The number of slots in the bit string. */
+ uint64_t n; /* The number of items to be inserted. */
+};
+
+struct __wt_bloom_hash {
+ uint64_t h1, h2; /* The two hashes used to calculate bits. */
+};
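+
+/*
+ * Illustrative arithmetic (standard Bloom filter math, not specific to this
+ * implementation): with m = n * factor bits and k hash functions, the
+ * expected false-positive rate is roughly
+ *	p = (1 - e^(-k * n / m))^k
+ * e.g. factor = 8 and k = 6 give p of roughly 2%.
+ */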
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
new file mode 100644
index 00000000000..0c4fe876e5e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -0,0 +1,1015 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_PAGE_HEADER --
+ * Blocks have a common header, a WT_PAGE_HEADER structure followed by a
+ * block-manager specific structure.
+ */
+struct __wt_page_header {
+ /*
+ * The record number of the first record of the page is stored on disk
+ * so we can figure out where the column-store leaf page fits into the
+ * key space during salvage.
+ */
+ uint64_t recno; /* 00-07: column-store starting recno */
+
+ /*
+ * We maintain page write-generations in the non-transactional case
+ * as that's how salvage can determine the most recent page between
+ * pages overlapping the same key range.
+ */
+ uint64_t write_gen; /* 08-15: write generation */
+
+ /*
+ * The page's in-memory size isn't rounded or aligned, it's the actual
+ * number of bytes the disk-image consumes when instantiated in memory.
+ */
+ uint32_t mem_size; /* 16-19: in-memory page size */
+
+ union {
+ uint32_t entries; /* 20-23: number of cells on page */
+ uint32_t datalen; /* 20-23: overflow data length */
+ } u;
+
+ uint8_t type; /* 24: page type */
+
+#define WT_PAGE_COMPRESSED 0x01 /* Page is compressed on disk */
+#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
+#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
+ uint8_t flags; /* 25: flags */
+
+ /*
+ * End the structure with 2 bytes of padding: it wastes space, but it
+ * leaves the structure 32-bit aligned and having a few bytes to play
+ * with in the future can't hurt.
+ */
+ uint8_t unused[2]; /* 26-27: unused padding */
+};
+/*
+ * WT_PAGE_HEADER_SIZE is the number of bytes we allocate for the structure: if
+ * the compiler inserts padding it will break the world.
+ */
+#define WT_PAGE_HEADER_SIZE 28
+
+/*
+ * The block-manager specific information immediately follows the WT_PAGE_HEADER
+ * structure.
+ */
+#define WT_BLOCK_HEADER_REF(dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_SIZE))
+
+/*
+ * WT_PAGE_HEADER_BYTE --
+ * WT_PAGE_HEADER_BYTE_SIZE --
+ * The first usable data byte on the block (past the combined headers).
+ */
+#define WT_PAGE_HEADER_BYTE_SIZE(btree) \
+ ((u_int)(WT_PAGE_HEADER_SIZE + (btree)->block_header))
+#define WT_PAGE_HEADER_BYTE(btree, dsk) \
+ ((void *)((uint8_t *)(dsk) + WT_PAGE_HEADER_BYTE_SIZE(btree)))
+
+/*
+ * WT_ADDR --
+ * An in-memory structure to hold a block's location.
+ */
+struct __wt_addr {
+ uint8_t *addr; /* Block-manager's cookie */
+ uint8_t size; /* Block-manager's cookie length */
+
+#define WT_ADDR_INT 1 /* Internal page */
+#define WT_ADDR_LEAF 2 /* Leaf page */
+#define WT_ADDR_LEAF_NO 3 /* Leaf page, no overflow */
+ uint8_t type;
+
+ /*
+ * If an address is used by both the previous and the current
+ * multi-block reconciliations, that is, a block we're writing matches
+ * the block written the last time, it will appear in both the current
+ * boundary points as well as the page modification's list of previous
+ * blocks. The reuse flag is how we know that's happening so the block
+ * is treated correctly (not free'd on error, for example).
+ */
+ uint8_t reuse;
+};
+
+/*
+ * Overflow tracking for reuse: When a page is reconciled, we write new K/V
+ * overflow items. If pages are reconciled multiple times, we need to know
+ * if we've already written a particular overflow record (so we don't write
+ * it again), as well as if we've modified an overflow record previously
+ * written (in which case we want to write a new record and discard blocks
+ * used by the previously written record). Track overflow records written
+ * for the page, storing the values in a skiplist with the record's value as
+ * the "key".
+ */
+struct __wt_ovfl_reuse {
+ uint32_t value_offset; /* Overflow value offset */
+ uint32_t value_size; /* Overflow value size */
+ uint8_t addr_offset; /* Overflow addr offset */
+ uint8_t addr_size; /* Overflow addr size */
+
+ /*
+ * On each page reconciliation, we clear the entry's in-use flag, and
+ * reset it as the overflow record is re-used. After reconciliation
+ * completes, unused skiplist entries are discarded, along with their
+ * underlying blocks.
+ *
+ * On each page reconciliation, set the just-added flag for each new
+ * skiplist entry; if reconciliation fails for any reason, discard the
+ * newly added skiplist entries, along with their underlying blocks.
+ */
+#define WT_OVFL_REUSE_INUSE 0x01
+#define WT_OVFL_REUSE_JUST_ADDED 0x02
+ uint8_t flags;
+
+ /*
+ * The untyped address immediately follows the WT_OVFL_REUSE structure,
+ * the untyped value immediately follows the address.
+ */
+#define WT_OVFL_REUSE_ADDR(p) \
+ ((void *)((uint8_t *)(p) + (p)->addr_offset))
+#define WT_OVFL_REUSE_VALUE(p) \
+ ((void *)((uint8_t *)(p) + (p)->value_offset))
+
+ WT_OVFL_REUSE *next[0]; /* Forward-linked skip list */
+};
+
+/*
+ * Overflow tracking for cached values: When a page is reconciled, we write new
+ * K/V overflow items, and discard previous underlying blocks. If there's a
+ * transaction in the system that needs to read the previous value, we have to
+ * cache the old value until no running transaction needs it.
+ */
+struct __wt_ovfl_txnc {
+ uint64_t current; /* Maximum transaction ID at store */
+
+ uint32_t value_offset; /* Overflow value offset */
+ uint32_t value_size; /* Overflow value size */
+ uint8_t addr_offset; /* Overflow addr offset */
+ uint8_t addr_size; /* Overflow addr size */
+
+ /*
+ * The untyped address immediately follows the WT_OVFL_TXNC
+ * structure, the untyped value immediately follows the address.
+ */
+#define WT_OVFL_TXNC_ADDR(p) \
+ ((void *)((uint8_t *)(p) + (p)->addr_offset))
+#define WT_OVFL_TXNC_VALUE(p) \
+ ((void *)((uint8_t *)(p) + (p)->value_offset))
+
+ WT_OVFL_TXNC *next[0]; /* Forward-linked skip list */
+};
+
+/*
+ * WT_PAGE_MODIFY --
+ * When a page is modified, there's additional information to maintain.
+ */
+struct __wt_page_modify {
+ /*
+ * Track the highest transaction ID at which the page was written to
+ * disk. This can be used to avoid trying to write the page multiple
+ * times if a snapshot is keeping old versions pinned (e.g., in a
+ * checkpoint).
+ */
+ uint64_t disk_snap_min;
+
+ /* The largest transaction ID seen on the page by reconciliation. */
+ uint64_t rec_max_txn;
+
+ /* The first unwritten transaction ID (approximate). */
+ uint64_t first_dirty_txn;
+
+ /* The largest update transaction ID (approximate). */
+ uint64_t update_txn;
+
+ /* Dirty bytes added to the cache. */
+ uint64_t bytes_dirty;
+
+ /*
+ * When pages are reconciled, the result is one or more replacement
+ * blocks. A replacement block can be in one of two states: it was
+ * written to disk, and so we have a block address, or it contained
+ * unresolved modifications and we have a disk image for it with a
+ * list of those unresolved modifications. The former is the common
+ * case: we only build lists of unresolved modifications when we're
+ * evicting a page, and we only expect to see unresolved modifications
+ * on a page being evicted in the case of a hot page that's too large
+ * to keep in memory as it is. In other words, checkpoints will skip
+ * unresolved modifications, and will write the blocks rather than
+ * build lists of unresolved modifications.
+ *
+ * Ugly union/struct layout to conserve memory, we never have both
+ * a replace address and multiple replacement blocks.
+ */
+ union {
+ WT_ADDR replace; /* Single, written replacement block */
+#define mod_replace u1.replace
+
+ struct { /* Multiple replacement blocks */
+ struct __wt_multi {
+ /*
+ * Block's key: either a column-store record number or a
+ * row-store variable length byte string.
+ */
+ union {
+ uint64_t recno;
+ WT_IKEY *ikey;
+ } key;
+
+ /*
+ * Eviction, but block wasn't written: unresolved updates and
+ * associated disk image.
+ *
+ * Skipped updates are either a WT_INSERT, or a row-store leaf
+ * page entry.
+ */
+ struct __wt_upd_skipped {
+ WT_INSERT *ins;
+ WT_ROW *rip;
+ } *skip;
+ uint32_t skip_entries;
+ void *skip_dsk;
+
+ /*
+ * Block was written: address, size and checksum.
+ * On subsequent reconciliations of this page, we avoid writing
+ * the block if it's unchanged by comparing size and checksum;
+ * the reuse flag is set when the block is unchanged and we're
+ * reusing a previous address.
+ */
+ WT_ADDR addr;
+ uint32_t size;
+ uint32_t cksum;
+ } *multi;
+ uint32_t multi_entries; /* Multiple blocks element count */
+ } m;
+#define mod_multi u1.m.multi
+#define mod_multi_entries u1.m.multi_entries
+ } u1;
+
+ /*
+ * Internal pages need to be able to chain root-page splits and have a
+ * special transactional eviction requirement. Column-store leaf pages
+ * need update and append lists.
+ *
+ * Ugly union/struct layout to conserve memory, a page is either a leaf
+ * page or an internal page.
+ */
+ union {
+ struct {
+ /*
+ * When a root page splits, we create a new page and write it;
+ * the new page can also split and so on, and we continue this
+ * process until we write a single replacement root page. We
+ * use the root split field to track the list of created pages
+ * so they can be discarded when no longer needed.
+ */
+ WT_PAGE *root_split; /* Linked list of root split pages */
+
+ /*
+ * When we deepen the tree, newly created internal pages cannot
+ * be evicted until all threads have exited the original page
+ * index structure. We set a transaction value during the split
+ * that's checked during eviction.
+ */
+ uint64_t split_txn; /* Split eviction transaction value */
+ } intl;
+#define mod_root_split u2.intl.root_split
+#define mod_split_txn u2.intl.split_txn
+ struct {
+ /*
+ * Appended items to column-stores: there is only a single one
+ * of these per column-store tree.
+ */
+ WT_INSERT_HEAD **append;
+
+ /*
+ * Updated items in column-stores: variable-length RLE entries
+ * can expand to multiple entries which requires some kind of
+ * list we can expand on demand. Updated items in fixed-length
+			 * files could be done based on a WT_UPDATE array as in
+ * row-stores, but there can be a very large number of bits on
+ * a single page, and the cost of the WT_UPDATE array would be
+ * huge.
+ */
+ WT_INSERT_HEAD **update;
+ } leaf;
+#define mod_append u2.leaf.append
+#define mod_update u2.leaf.update
+ } u2;
+
+ /*
+ * Overflow record tracking for reconciliation. We assume overflow
+ * records are relatively rare, so we don't allocate the structures
+ * to track them until we actually see them in the data.
+ */
+ struct __wt_ovfl_track {
+ /*
+ * Overflow key/value address/byte-string pairs we potentially
+ * reuse each time we reconcile the page.
+ */
+ WT_OVFL_REUSE *ovfl_reuse[WT_SKIP_MAXDEPTH];
+
+ /*
+ * Overflow value address/byte-string pairs cached until no
+ * running transaction will possibly read them.
+ */
+ WT_OVFL_TXNC *ovfl_txnc[WT_SKIP_MAXDEPTH];
+
+ /*
+ * Overflow key/value addresses to be discarded from the block
+ * manager after reconciliation completes successfully.
+ */
+ WT_CELL **discard;
+ size_t discard_entries;
+ size_t discard_allocated;
+ } *ovfl_track;
+
+ /*
+ * The write generation is incremented when a page is modified, a page
+ * is clean if the write generation is 0.
+ *
+ * !!!
+ * 4B values are probably larger than required, but I'm more confident
+ * 4B types will always be backed by atomic writes to memory.
+ */
+ uint32_t write_gen;
+
+#define WT_PAGE_LOCK(s, p) \
+ __wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
+#define WT_PAGE_UNLOCK(s, p) \
+ __wt_spin_unlock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
+ uint8_t page_lock; /* Page's spinlock */
+
+#define WT_PM_REC_EMPTY 0x01 /* Reconciliation: no replacement */
+#define WT_PM_REC_MULTIBLOCK 0x02 /* Reconciliation: multiple blocks */
+#define WT_PM_REC_REPLACE 0x04 /* Reconciliation: single block */
+#define WT_PM_REC_MASK \
+ (WT_PM_REC_EMPTY | WT_PM_REC_MULTIBLOCK | WT_PM_REC_REPLACE)
+ uint8_t flags; /* Page flags */
+};
+
+/*
+ * WT_PAGE --
+ * The WT_PAGE structure describes the in-memory page information.
+ */
+struct __wt_page {
+ /* Per page-type information. */
+ union {
+ /*
+ * Internal pages (both column- and row-store).
+ *
+ * The page record number is only used by column-store, but it
+		 * makes some things simpler and it doesn't cost us any
+		 * memory: other structures in this union are at least as
+		 * large.
+ *
+ * In-memory internal pages have an array of pointers to child
+ * structures, maintained in collated order. When a page is
+ * read into memory, the initial list of children is stored in
+ * the "orig_index" field, and it and the collated order are
+ * the same. After a page splits, the collated order and the
+ * original order will differ.
+ *
+ * Multiple threads of control may be searching the in-memory
+ * internal page and a child page of the internal page may
+ * cause a split at any time. When a page splits, a new array
+ * is allocated and atomically swapped into place. Threads in
+ * the old array continue without interruption (the old array is
+ * still valid), but have to avoid racing. No barrier is needed
+ * because the array reference is updated atomically, but code
+ * reading the fields multiple times would be a very bad idea.
+ * Specifically, do not do this:
+		 *	WT_REF **refp = page->u.intl.__index->index;
+		 *	uint32_t entries = page->u.intl.__index->entries;
+ *
+ * The field is declared volatile (so the compiler knows not to
+ * read it multiple times), and we obscure the field name and
+ * use a copy macro in all references to the field (so the code
+ * doesn't read it multiple times).
+ */
+ struct {
+ uint64_t recno; /* Starting recno */
+ WT_REF *parent_ref; /* Parent reference */
+
+ struct __wt_page_index {
+ uint32_t entries;
+ WT_REF **index;
+ } * volatile __index; /* Collated children */
+ } intl;
+#undef pg_intl_recno
+#define pg_intl_recno u.intl.recno
+#define pg_intl_parent_ref u.intl.parent_ref
+
+ /*
+ * Macros to copy/set the index because the name is obscured to ensure
+ * the field isn't read multiple times.
+ */
+#define WT_INTL_INDEX_COPY(page) ((page)->u.intl.__index)
+#define WT_INTL_INDEX_SET(page, v) do { \
+ WT_WRITE_BARRIER(); \
+ ((page)->u.intl.__index) = (v); \
+} while (0)
+
+ /*
+ * Macro to walk the list of references in an internal page.
+ */
+#define WT_INTL_FOREACH_BEGIN(session, page, ref) do { \
+ WT_PAGE_INDEX *__pindex; \
+ WT_REF **__refp; \
+ WT_SESSION_IMPL *__session = (session); \
+ uint32_t __entries; \
+ WT_ENTER_PAGE_INDEX(session); \
+ for (__pindex = WT_INTL_INDEX_COPY(page), \
+ __refp = __pindex->index, \
+ __entries = __pindex->entries; __entries > 0; --__entries) {\
+ (ref) = *__refp++;
+#define WT_INTL_FOREACH_END \
+ } \
+ WT_LEAVE_PAGE_INDEX(__session); \
+ } while (0)
+
+ /* Row-store leaf page. */
+ struct {
+ WT_ROW *d; /* Key/value pairs */
+
+ /*
+ * The column-store leaf page modification structures
+ * live in the WT_PAGE_MODIFY structure to keep the
+ * WT_PAGE structure as small as possible for read-only
+ * pages. For consistency, we could move the row-store
+ * modification structures into WT_PAGE_MODIFY too, but
+ * that doesn't shrink WT_PAGE any further and it would
+ * require really ugly naming inside of WT_PAGE_MODIFY
+ * to avoid growing that structure.
+ */
+ WT_INSERT_HEAD **ins; /* Inserts */
+ WT_UPDATE **upd; /* Updates */
+
+ uint32_t entries; /* Entries */
+ } row;
+#undef pg_row_d
+#define pg_row_d u.row.d
+#undef pg_row_ins
+#define pg_row_ins u.row.ins
+#undef pg_row_upd
+#define pg_row_upd u.row.upd
+#define pg_row_entries u.row.entries
+
+ /* Fixed-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+
+ uint8_t *bitf; /* Values */
+ uint32_t entries; /* Entries */
+ } col_fix;
+#undef pg_fix_recno
+#define pg_fix_recno u.col_fix.recno
+#undef pg_fix_bitf
+#define pg_fix_bitf u.col_fix.bitf
+#undef pg_fix_entries
+#define pg_fix_entries u.col_fix.entries
+
+ /* Variable-length column-store leaf page. */
+ struct {
+ uint64_t recno; /* Starting recno */
+
+ WT_COL *d; /* Values */
+
+ /*
+ * Variable-length column-store files maintain a list of
+ * RLE entries on the page so it's unnecessary to walk
+ * the page counting records to find a specific entry.
+ */
+ WT_COL_RLE *repeats; /* RLE array for lookups */
+ uint32_t nrepeats; /* Number of repeat slots */
+
+ uint32_t entries; /* Entries */
+ } col_var;
+#undef pg_var_recno
+#define pg_var_recno u.col_var.recno
+#undef pg_var_d
+#define pg_var_d u.col_var.d
+#undef pg_var_repeats
+#define pg_var_repeats u.col_var.repeats
+#undef pg_var_nrepeats
+#define pg_var_nrepeats u.col_var.nrepeats
+#undef pg_var_entries
+#define pg_var_entries u.col_var.entries
+ } u;
+
+ /* Page's on-disk representation: NULL for pages created in memory. */
+ const WT_PAGE_HEADER *dsk;
+
+ /* If/when the page is modified, we need lots more information. */
+ WT_PAGE_MODIFY *modify;
+
+ /*
+ * The page's read generation acts as an LRU value for each page in the
+ * tree; it is used by the eviction server thread to select pages to be
+ * discarded from the in-memory tree.
+ *
+	 * The read generation is a 64-bit value: if incremented frequently, a
+ * 32-bit value could overflow.
+ *
+ * The read generation is a piece of shared memory potentially read
+ * by many threads. We don't want to update page read generations for
+ * in-cache workloads and suffer the cache misses, so we don't simply
+ * increment the read generation value on every access. Instead, the
+ * read generation is incremented by the eviction server each time it
+ * becomes active. To avoid incrementing a page's read generation too
+ * frequently, it is set to a future point.
+ */
+#define WT_READGEN_NOTSET 0
+#define WT_READGEN_OLDEST 1
+#define WT_READGEN_STEP 100
+ uint64_t read_gen;
+
+ uint64_t memory_footprint; /* Memory attached to the page */
+
+#define WT_PAGE_IS_INTERNAL(page) \
+ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT)
+#define WT_PAGE_INVALID 0 /* Invalid page */
+#define WT_PAGE_BLOCK_MANAGER 1 /* Block-manager page */
+#define WT_PAGE_COL_FIX 2 /* Col-store fixed-len leaf */
+#define WT_PAGE_COL_INT 3 /* Col-store internal page */
+#define WT_PAGE_COL_VAR 4 /* Col-store var-length leaf page */
+#define WT_PAGE_OVFL 5 /* Overflow page */
+#define WT_PAGE_ROW_INT 6 /* Row-store internal page */
+#define WT_PAGE_ROW_LEAF 7 /* Row-store leaf page */
+ uint8_t type; /* Page type */
+
+#define WT_PAGE_BUILD_KEYS 0x01 /* Keys have been built in memory */
+#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
+#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
+#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
+#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_SPLITTING 0x20 /* An internal page is growing. */
+ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
+};
+
+/*
+ * WT_PAGE_DISK_OFFSET, WT_PAGE_REF_OFFSET --
+ * Return the offset/pointer of a pointer/offset in a page disk image.
+ */
+#define WT_PAGE_DISK_OFFSET(page, p) \
+ WT_PTRDIFF32(p, (page)->dsk)
+#define WT_PAGE_REF_OFFSET(page, o) \
+ ((void *)((uint8_t *)((page)->dsk) + (o)))
+
+/*
+ * Page state.
+ *
+ * Synchronization is based on the WT_REF->state field, which has a number of
+ * possible states:
+ *
+ * WT_REF_DISK:
+ * The initial setting before a page is brought into memory, and set as a
+ * result of page eviction; the page is on disk, and must be read into
+ * memory before use. WT_REF_DISK has a value of 0 (the default state
+ * after allocating cleared memory).
+ *
+ * WT_REF_DELETED:
+ * The page is on disk, but has been deleted from the tree; we can delete
+ * row-store leaf pages without reading them if they don't reference
+ * overflow items.
+ *
+ * WT_REF_LOCKED:
+ * Locked for exclusive access. In eviction, this page or a parent has
+ * been selected for eviction; once hazard pointers are checked, the page
+ * will be evicted. When reading a page that was previously deleted, it
+ * is locked until the page is in memory with records marked deleted. The
+ * thread that set the page to WT_REF_LOCKED has exclusive access, no
+ * other thread may use the WT_REF until the state is changed.
+ *
+ * WT_REF_MEM:
+ * Set by a reading thread once the page has been read from disk; the page
+ * is in the cache and the page reference is OK.
+ *
+ * WT_REF_READING:
+ * Set by a reading thread before reading an ordinary page from disk;
+ * other readers of the page wait until the read completes. Sync can
+ * safely skip over such pages: they are clean by definition.
+ *
+ * WT_REF_SPLIT:
+ * Set when the page is split; the WT_REF is dead and can no longer be
+ * used.
+ *
+ * The life cycle of a typical page goes like this: pages are read into memory
+ * from disk and their state set to WT_REF_MEM. When the page is selected for
+ * eviction, the page state is set to WT_REF_LOCKED. In all cases, evicting
+ * threads reset the page's state when finished with the page: if eviction was
+ * successful (a clean page was discarded, and a dirty page was written to disk
+ * and then discarded), the page state is set to WT_REF_DISK; if eviction failed
+ * because the page was busy, page state is reset to WT_REF_MEM.
+ *
+ * Readers check the state field and if it's WT_REF_MEM, they set a hazard
+ * pointer to the page, flush memory and re-confirm the page state. If the
+ * page state is unchanged, the reader has a valid reference and can proceed.
+ *
+ * When an evicting thread wants to discard a page from the tree, it sets the
+ * WT_REF_LOCKED state, flushes memory, then checks hazard pointers. If a
+ * hazard pointer is found, state is reset to WT_REF_MEM, restoring the page
+ * to the readers. If the evicting thread does not find a hazard pointer,
+ * the page is evicted.
+ */
+typedef enum __wt_page_state {
+ WT_REF_DISK=0, /* Page is on disk */
+ WT_REF_DELETED, /* Page is on disk, but deleted */
+ WT_REF_LOCKED, /* Page locked for exclusive access */
+ WT_REF_MEM, /* Page is in cache and valid */
+ WT_REF_READING, /* Page being read */
+ WT_REF_SPLIT /* Page was split */
+} WT_PAGE_STATE;
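+
+/*
+ * Illustrative reader check (a sketch with hypothetical hazard_set and
+ * hazard_clear helpers, not the implementation):
+ *
+ *	if (ref->state == WT_REF_MEM) {
+ *		hazard_set(session, ref);
+ *		WT_FULL_BARRIER();
+ *		if (ref->state == WT_REF_MEM)
+ *			return (0);
+ *		hazard_clear(session, ref);
+ *	}
+ *	... otherwise retry, or read the page from disk ...
+ */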
+
+/*
+ * WT_PAGE_DELETED --
+ * Related information for fast-delete, on-disk pages.
+ */
+struct __wt_page_deleted {
+ uint64_t txnid; /* Transaction ID */
+
+ WT_UPDATE **update_list; /* List of updates for abort */
+};
+
+/*
+ * WT_REF --
+ * A single in-memory page and the state information used to determine if
+ * it's OK to dereference the pointer to the page.
+ */
+struct __wt_ref {
+ WT_PAGE *page; /* Page */
+
+ /*
+ * When the tree deepens as a result of a split, the home page value
+	 * changes. Don't cache it; we need to see that change when looking
+ * up our slot in the page's index structure.
+ */
+ WT_PAGE * volatile home; /* Reference page */
+ uint32_t ref_hint; /* Reference page index hint */
+
+ volatile WT_PAGE_STATE state; /* Page state */
+
+ /*
+ * Address: on-page cell if read from backing block, off-page WT_ADDR
+ * if instantiated in-memory, or NULL if page created in-memory.
+ */
+ void *addr;
+
+ /*
+ * The child page's key. Do NOT change this union without reviewing
+ * __wt_ref_key.
+ */
+ union {
+ uint64_t recno; /* Column-store: starting recno */
+ void *ikey; /* Row-store: key */
+ } key;
+
+ WT_PAGE_DELETED *page_del; /* Deleted on-disk page information */
+};
+/*
+ * WT_REF_SIZE is the expected structure size -- we verify the build to ensure
+ * the compiler hasn't inserted padding which would break the world.
+ */
+#define WT_REF_SIZE 48
+
+/*
+ * WT_ROW --
+ * Each in-memory row-store leaf page has an array of WT_ROW structures:
+ * this is created from on-page data when a page is read from the file. It's
+ * sorted by key, fixed in size, and starts with a reference to on-page data.
+ *
+ * Multiple threads of control may be searching the in-memory row-store pages,
+ * and the key may be instantiated at any time. Code must be able to handle
+ * both when the key has not been instantiated (the key field points into the
+ * page's disk image), and when the key has been instantiated (the key field
+ * points outside the page's disk image). We don't need barriers because the
+ * key is updated atomically, but code that reads the key field multiple times
+ * is a very, very bad idea. Specifically, do not do this:
+ *
+ * key = rip->key;
+ * if (key_is_on_page(key)) {
+ * cell = rip->key;
+ * }
+ *
+ * The field is declared volatile (so the compiler knows it shouldn't read it
+ * multiple times), and we obscure the field name and use a copy macro in all
+ * references to the field (so the code doesn't read it multiple times), all
+ * to make sure we don't introduce this bug (again).
+ */
+struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */
+ void * volatile __key;
+};
+#define WT_ROW_KEY_COPY(rip) ((rip)->__key)
+#define WT_ROW_KEY_SET(rip, v) ((rip)->__key) = (void *)(v)
+
+/*
+ * WT_ROW_FOREACH --
+ * Walk the entries of an in-memory row-store leaf page.
+ */
+#define WT_ROW_FOREACH(page, rip, i) \
+ for ((i) = (page)->pg_row_entries, \
+ (rip) = (page)->pg_row_d; (i) > 0; ++(rip), --(i))
+#define WT_ROW_FOREACH_REVERSE(page, rip, i) \
+ for ((i) = (page)->pg_row_entries, \
+ (rip) = (page)->pg_row_d + ((page)->pg_row_entries - 1); \
+ (i) > 0; --(rip), --(i))
+
+/*
+ * WT_ROW_SLOT --
+ * Return the 0-based array offset based on a WT_ROW reference.
+ */
+#define WT_ROW_SLOT(page, rip) \
+ ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row_d))
+
+/*
+ * WT_COL --
+ * Each in-memory variable-length column-store leaf page has an array of WT_COL
+ * structures: this is created from on-page data when a page is read from the
+ * file. It's fixed in size, and references data on the page.
+ */
+struct __wt_col {
+ /*
+ * Variable-length column-store data references are page offsets, not
+ * pointers (we boldly re-invent short pointers). The trade-off is 4B
+ * per K/V pair on a 64-bit machine vs. a single cycle for the addition
+ * of a base pointer. The on-page data is a WT_CELL (same as row-store
+ * pages).
+ *
+ * If the value is 0, it's a single, deleted record.
+ *
+ * Obscure the field name, code shouldn't use WT_COL->__col_value, the
+ * public interface is WT_COL_PTR and WT_COL_PTR_SET.
+ */
+ uint32_t __col_value;
+};
+
+/*
+ * WT_COL_RLE --
+ * In variable-length column store leaf pages, we build an array of entries
+ * with RLE counts greater than 1 when reading the page. We can do a binary
+ * search in this array, then an offset calculation to find the cell.
+ */
+struct __wt_col_rle {
+ uint64_t recno; /* Record number of first repeat. */
+ uint64_t rle; /* Repeat count. */
+ uint32_t indx; /* Slot of entry in col_var.d */
+} WT_GCC_ATTRIBUTE((packed));
+
+/*
+ * WT_COL_PTR, WT_COL_PTR_SET --
+ * Return/Set a pointer corresponding to the data offset. (If the item does
+ * not exist on the page, return a NULL.)
+ */
+#define WT_COL_PTR(page, cip) \
+ ((cip)->__col_value == 0 ? \
+ NULL : WT_PAGE_REF_OFFSET(page, (cip)->__col_value))
+#define WT_COL_PTR_SET(cip, value) \
+ (cip)->__col_value = (value)
+
+/*
+ * WT_COL_FOREACH --
+ *	Walk the entries of a variable-length column-store leaf page.
+ */
+#define WT_COL_FOREACH(page, cip, i) \
+ for ((i) = (page)->pg_var_entries, \
+ (cip) = (page)->pg_var_d; (i) > 0; ++(cip), --(i))
+
+/*
+ * WT_COL_SLOT --
+ * Return the 0-based array offset based on a WT_COL reference.
+ */
+#define WT_COL_SLOT(page, cip) \
+ ((uint32_t)(((WT_COL *)cip) - (page)->pg_var_d))
+
+/*
+ * WT_IKEY --
+ * Instantiated key: row-store keys are usually prefix compressed and sometimes
+ * Huffman encoded or overflow objects. Normally, a row-store page in-memory
+ * key points to the on-page WT_CELL, but in some cases, we instantiate the key
+ * in memory, in which case the row-store page in-memory key points to a WT_IKEY
+ * structure.
+ */
+struct __wt_ikey {
+ uint32_t size; /* Key length */
+
+ /*
+ * If we no longer point to the key's on-page WT_CELL, we can't find its
+ * related value. Save the offset of the key cell in the page.
+ *
+ * Row-store cell references are page offsets, not pointers (we boldly
+ * re-invent short pointers). The trade-off is 4B per K/V pair on a
+ * 64-bit machine vs. a single cycle for the addition of a base pointer.
+ */
+ uint32_t cell_offset;
+
+ /* The key bytes immediately follow the WT_IKEY structure. */
+#define WT_IKEY_DATA(ikey) \
+ ((void *)((uint8_t *)(ikey) + sizeof(WT_IKEY)))
+};
+
+/*
+ * WT_UPDATE --
+ * Entries on leaf pages can be updated, either modified or deleted. Updates
+ * to entries referenced from the WT_ROW and WT_COL arrays are stored in the
+ * page's WT_UPDATE array. When the first element on a page is updated, the
+ * WT_UPDATE array is allocated, with one slot for every existing element in
+ * the page. A slot points to a WT_UPDATE structure; if more than one update
+ * is done for an entry, WT_UPDATE structures are formed into a forward-linked
+ * list.
+ */
+struct __wt_update {
+ uint64_t txnid; /* update transaction */
+
+ WT_UPDATE *next; /* forward-linked list */
+
+ /*
+ * We use the maximum size as an is-deleted flag, which means we can't
+ * store 4GB objects; I'd rather do that than increase the size of this
+ * structure for a flag bit.
+ */
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+ uint32_t size; /* update length */
+
+ /* The untyped value immediately follows the WT_UPDATE structure. */
+#define WT_UPDATE_DATA(upd) \
+ ((void *)((uint8_t *)(upd) + sizeof(WT_UPDATE)))
+} WT_GCC_ATTRIBUTE((packed));
+
+/*
+ * WT_INSERT --
+ * Row-store leaf pages support inserts of new K/V pairs. When the first K/V
+ * pair is inserted, the WT_INSERT_HEAD array is allocated, with one slot for
+ * every existing element in the page, plus one additional slot. A slot points
+ * to a WT_INSERT_HEAD structure for the items which sort after the WT_ROW
+ * element that references it and before the subsequent WT_ROW element; the
+ * skiplist structure has a randomly chosen depth of next pointers in each
+ * inserted node.
+ *
+ * The additional slot exists because it's possible to insert items that sort
+ * before any key on the page: the first slot of the insert array holds those
+ * keys.
+ *
+ * In column-store variable-length run-length encoded pages, a single indx
+ * entry may reference a large number of records, because there's a single
+ * on-page entry representing many identical records. (We don't expand those
+ * entries when the page comes into memory, as that would require resources as
+ * pages are moved to/from the cache, including read-only files.) Instead, a
+ * single indx entry represents all of the identical records originally found
+ * on the page.
+ *
+ * Modifying (or deleting) run-length encoded column-store records is hard
+ * because the page's entry no longer references a set of identical items. We
+ * handle this by "inserting" a new entry into the insert array, with its own
+ * record number. (This is the only case where it's possible to insert into a
+ * column-store: only appends are allowed, as insert requires re-numbering
+ * subsequent records. Berkeley DB did support mutable records, but it won't
+ * scale and it isn't useful enough to re-implement, IMNSHO.)
+ */
+struct __wt_insert {
+ WT_UPDATE *upd; /* value */
+
+ union {
+ uint64_t recno; /* column-store record number */
+ struct {
+ uint32_t offset; /* row-store key data start */
+ uint32_t size; /* row-store key data size */
+ } key;
+ } u;
+
+#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)ins)->u.key.size)
+#define WT_INSERT_KEY(ins) \
+ ((void *)((uint8_t *)(ins) + ((WT_INSERT *)ins)->u.key.offset))
+#define WT_INSERT_RECNO(ins) (((WT_INSERT *)ins)->u.recno)
+
+ WT_INSERT *next[0]; /* forward-linked skip list */
+};
+
+/*
+ * Skiplist helper macros.
+ */
+#define WT_SKIP_FIRST(ins_head) \
+ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->head[0])
+#define WT_SKIP_LAST(ins_head) \
+ (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->tail[0])
+#define WT_SKIP_NEXT(ins) ((ins)->next[0])
+#define WT_SKIP_FOREACH(ins, ins_head) \
+ for ((ins) = WT_SKIP_FIRST(ins_head); \
+ (ins) != NULL; \
+ (ins) = WT_SKIP_NEXT(ins))
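+
+/*
+ * For example, visiting every inserted K/V pair on a row-store slot's
+ * skiplist might look like this sketch ("visit" is hypothetical; the
+ * macros handle a NULL insert head):
+ *
+ *	WT_INSERT *ins;
+ *	WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
+ *		visit(WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), ins->upd);
+ */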
+
+/*
+ * Atomically allocate and swap a structure or array into place.
+ */
+#define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \
+ if (((v) = (dest)) == NULL) { \
+ WT_ERR(__wt_calloc_def(s, count, &(v))); \
+ if (WT_ATOMIC_CAS8(dest, NULL, v)) \
+ __wt_cache_page_inmem_incr( \
+ s, page, (count) * sizeof(*(v))); \
+ else \
+ __wt_free(s, v); \
+ } \
+} while (0)
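+
+/*
+ * A typical (illustrative) use, lazily creating a row-store update array:
+ * racing threads may both allocate, but only one CAS wins and the loser
+ * frees its copy. Note the macro uses WT_ERR, so the caller needs an err
+ * label:
+ *
+ *	WT_UPDATE **upd_array;
+ *	WT_PAGE_ALLOC_AND_SWAP(session, page,
+ *	    page->pg_row_upd, upd_array, page->pg_row_entries);
+ */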
+
+/*
+ * WT_INSERT_HEAD --
+ * The head of a skiplist of WT_INSERT items.
+ */
+struct __wt_insert_head {
+ WT_INSERT *head[WT_SKIP_MAXDEPTH]; /* first item on skiplists */
+ WT_INSERT *tail[WT_SKIP_MAXDEPTH]; /* last item on skiplists */
+};
+
+/*
+ * The row-store leaf page insert lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_ROW_INSERT_SLOT(page, slot) \
+ ((page)->pg_row_ins == NULL ? NULL : (page)->pg_row_ins[slot])
+#define WT_ROW_INSERT(page, ip) \
+ WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
+#define WT_ROW_UPDATE(page, ip) \
+ ((page)->pg_row_upd == NULL ? \
+ NULL : (page)->pg_row_upd[WT_ROW_SLOT(page, ip)])
+/*
+ * WT_ROW_INSERT_SMALLEST references an additional slot past the end of
+ * the "one per WT_ROW slot" insert array. That's because the insert array
+ * requires an extra slot to hold keys that sort before any key found on the
+ * original page.
+ */
+#define WT_ROW_INSERT_SMALLEST(page) \
+ ((page)->pg_row_ins == NULL ? \
+ NULL : (page)->pg_row_ins[(page)->pg_row_entries])
+
+/*
+ * The column-store leaf page update lists are arrays of pointers to structures,
+ * and may not exist. The following macros return an array entry if the array
+ * of pointers and the specific structure exist, else NULL.
+ */
+#define WT_COL_UPDATE_SLOT(page, slot) \
+ ((page)->modify == NULL || (page)->modify->mod_update == NULL ? \
+ NULL : (page)->modify->mod_update[slot])
+#define WT_COL_UPDATE(page, ip) \
+ WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))
+
+/*
+ * WT_COL_UPDATE_SINGLE is a single WT_INSERT list, used for any fixed-length
+ * column-store updates for a page.
+ */
+#define WT_COL_UPDATE_SINGLE(page) \
+ WT_COL_UPDATE_SLOT(page, 0)
+
+/*
+ * WT_COL_APPEND is a WT_INSERT list, used for fixed- and variable-length
+ * appends.
+ */
+#define WT_COL_APPEND(page) \
+ ((page)->modify != NULL && (page)->modify->mod_append != NULL ? \
+ (page)->modify->mod_append[0] : NULL)
+
+/* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */
+#define WT_FIX_FOREACH(btree, dsk, v, i) \
+ for ((i) = 0, \
+ (v) = (i) < (dsk)->u.entries ? \
+ __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), 0, (btree)->bitcnt) : 0; \
+ (i) < (dsk)->u.entries; ++(i), \
+ (v) = __bit_getv( \
+ WT_PAGE_HEADER_BYTE(btree, dsk), i, (btree)->bitcnt))
+
+/*
+ * Manage split generation numbers. Splits walk the list of sessions to check
+ * when it is safe to free structures that have been replaced. We also check
+ * that list periodically (e.g., when wrapping up a transaction) to free any
+ * memory we can.
+ *
+ * Before a thread enters code that will examine page indexes (which are
+ * swapped out by splits), it publishes a copy of the current split generation
+ * into its session. Don't assume that threads never re-enter this code: if we
+ * already have a split generation, leave it alone. If our caller is examining
+ * an index, we don't want the oldest split generation to move forward and
+ * potentially free it.
+ */
+#define WT_ENTER_PAGE_INDEX(session) do { \
+ uint64_t __prev_split_gen = (session)->split_gen; \
+ if (__prev_split_gen == 0) \
+ WT_PUBLISH((session)->split_gen, S2C(session)->split_gen)
+
+#define WT_LEAVE_PAGE_INDEX(session) \
+ if (__prev_split_gen == 0) \
+ (session)->split_gen = 0; \
+ } while (0)
+
+#define WT_WITH_PAGE_INDEX(session, e) \
+ WT_ENTER_PAGE_INDEX(session); \
+ (e); \
+ WT_LEAVE_PAGE_INDEX(session)
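+
+/*
+ * For example, a caller about to walk an internal page's index might wrap
+ * the traversal as follows ("__ref_walk" is a hypothetical worker):
+ *
+ *	WT_WITH_PAGE_INDEX(session,
+ *	    ret = __ref_walk(session, ref));
+ */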
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
new file mode 100644
index 00000000000..05250951a65
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Supported btree formats: the "current" version is the maximum supported
+ * major/minor versions.
+ */
+#define WT_BTREE_MAJOR_VERSION_MIN 1 /* Oldest version supported */
+#define WT_BTREE_MINOR_VERSION_MIN 1
+
+#define WT_BTREE_MAJOR_VERSION_MAX 1 /* Newest version supported */
+#define WT_BTREE_MINOR_VERSION_MAX 1
+
+/*
+ * The maximum btree leaf and internal page size is 512MB (2^29). The limit
+ * is enforced in software; it could be larger (specifically, the underlying
+ * default block manager can support 4GB, 2^32). Currently, the maximum page
+ * size must accommodate our dependence on the maximum page size fitting into
+ * a number of bits less than 32; see the row-store page key-lookup functions
+ * for the magic.
+ */
+#define WT_BTREE_PAGE_SIZE_MAX (512 * WT_MEGABYTE)
+
+/*
+ * The lengths of variable-length column-store values and row-store
+ * keys/values are stored in a 4B type, so the largest theoretical key/value
+ * item is 4GB. However, there are two constraints: first, the WT_UPDATE
+ * structure uses the UINT32_MAX size as a "deleted" flag, and second, the
+ * size of an overflow object is constrained by what an
+ * underlying block manager can actually write. (For example, in the default
+ * block manager, writing an overflow item includes the underlying block's page
+ * header and block manager specific structure, aligned to an allocation-sized
+ * unit). The btree engine limits the size of a single object to (4GB - 1KB);
+ * that gives us additional bytes if we ever want to store a structure length
+ * plus the object size in 4B, or if we need additional flag values. Attempts
+ * to store large key/value items in the tree trigger an immediate check to the
+ * block manager, to make sure it can write the item. Storing 4GB objects in a
+ * btree borders on clinical insanity, anyway.
+ *
+ * Record numbers are stored in 64-bit unsigned integers, meaning the largest
+ * record number is "really, really big".
+ */
+#define WT_BTREE_MAX_OBJECT_SIZE (UINT32_MAX - 1024)
+
+/*
+ * A location in a file is a variable-length cookie, but it has a maximum size
+ * so it's easy to create temporary space in which to store them. (Locations
+ * can't be much larger than this anyway; they must fit onto the minimum size
+ * page because a reference to an overflow page is itself a location.)
+ */
+#define WT_BTREE_MAX_ADDR_COOKIE 255 /* Maximum address cookie */
+
+/*
+ * WT_BTREE --
+ * A btree handle.
+ */
+struct __wt_btree {
+ WT_DATA_HANDLE *dhandle;
+
+ WT_CKPT *ckpt; /* Checkpoint information */
+
+ enum { BTREE_COL_FIX=1, /* Fixed-length column store */
+ BTREE_COL_VAR=2, /* Variable-length column store */
+ BTREE_ROW=3 /* Row-store */
+ } type; /* Type */
+
+ const char *key_format; /* Key format */
+ const char *value_format; /* Value format */
+ uint8_t bitcnt; /* Fixed-length field size in bits */
+
+ WT_COLLATOR *collator; /* Row-store comparator */
+ int collator_owned; /* The collator needs to be freed */
+
+ uint32_t id; /* File ID, for logging */
+
+ uint32_t key_gap; /* Row-store prefix key gap */
+
+ uint32_t allocsize; /* Allocation size */
+ uint32_t maxintlpage; /* Internal page max size */
+ uint32_t maxintlitem; /* Internal page max item size */
+ uint32_t maxleafpage; /* Leaf page max size */
+ uint32_t maxleafitem; /* Leaf page max item size */
+ uint64_t maxmempage; /* In memory page max size */
+
+ void *huffman_key; /* Key huffman encoding */
+ void *huffman_value; /* Value huffman encoding */
+
+ enum { CKSUM_ON=1, /* On */
+ CKSUM_OFF=2, /* Off */
+ CKSUM_UNCOMPRESSED=3 /* Uncompressed blocks only */
+ } checksum; /* Checksum configuration */
+
+ u_int dictionary; /* Reconcile: dictionary slots */
+ int internal_key_truncate; /* Reconcile: internal key truncate */
+ int maximum_depth; /* Reconcile: maximum tree depth */
+ int prefix_compression; /* Reconcile: prefix compression */
+ u_int prefix_compression_min; /* Reconcile: prefix compression min */
+ int split_pct; /* Reconcile: split page percent */
+ WT_COMPRESSOR *compressor; /* Reconcile: page compressor */
+ WT_RWLOCK *ovfl_lock; /* Reconcile: overflow lock */
+
+ uint64_t last_recno; /* Column-store last record number */
+
+ WT_REF root; /* Root page reference */
+ int modified; /* If the tree ever modified */
+ int bulk_load_ok; /* Bulk-load is a possibility */
+
+ WT_BM *bm; /* Block manager reference */
+ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
+
+ uint64_t write_gen; /* Write generation */
+
+ WT_REF *evict_ref; /* Eviction thread's location */
+ uint64_t evict_priority; /* Relative priority of cached pages */
+ u_int evict_walk_period; /* Skip this many LRU walks */
+ u_int evict_walk_skips; /* Number of walks skipped */
+ volatile uint32_t evict_busy; /* Count of threads in eviction */
+
+ int checkpointing; /* Checkpoint in progress */
+
+ /*
+	 * We flush pages from the tree (in order to make checkpoints faster)
+	 * without holding a high-level lock. To avoid multiple threads flushing at
+ * the same time, lock the tree.
+ */
+ WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */
+
+ /* Flags values up to 0xff are reserved for WT_DHANDLE_* */
+#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */
+#define WT_BTREE_NO_EVICTION 0x00200 /* Disable eviction */
+#define WT_BTREE_NO_HAZARD 0x00400 /* Disable hazard pointers */
+#define WT_BTREE_SALVAGE 0x00800 /* Handle is for salvage */
+#define WT_BTREE_UPGRADE 0x01000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x02000 /* Handle is for verify */
+ uint32_t flags;
+};
+
+/* Flags that make a btree handle special (not for normal use). */
+#define WT_BTREE_SPECIAL_FLAGS \
+ (WT_BTREE_BULK | WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)
+
+/*
+ * WT_SALVAGE_COOKIE --
+ * Encapsulation of salvage information for reconciliation.
+ */
+struct __wt_salvage_cookie {
+ uint64_t missing; /* Initial items to create */
+ uint64_t skip; /* Initial items to skip */
+ uint64_t take; /* Items to take */
+
+ int done; /* Ignore the rest */
+};
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
new file mode 100644
index 00000000000..b7957e6647f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -0,0 +1,1216 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_ref_is_root --
+ * Return if the page reference is for the root page.
+ */
+static inline int
+__wt_ref_is_root(WT_REF *ref)
+{
+ return (ref->home == NULL ? 1 : 0);
+}
+
+/*
+ * __wt_page_is_modified --
+ * Return if the page is dirty.
+ */
+static inline int
+__wt_page_is_modified(WT_PAGE *page)
+{
+ return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0);
+}
+
+/*
+ * Estimate the per-allocation overhead. All implementations of malloc / free
+ * have some kind of header and pad for alignment. We can't know for sure what
+ * that adds up to, but this is an estimate based on some measurements of heap
+ * size versus bytes in use.
+ */
+#define WT_ALLOC_OVERHEAD 32U
+
+/*
+ * __wt_cache_page_inmem_incr --
+ * Increment a page's memory footprint in the cache.
+ */
+static inline void
+__wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
+{
+ WT_CACHE *cache;
+
+ size += WT_ALLOC_OVERHEAD;
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_ADD8(cache->bytes_inmem, size);
+ (void)WT_ATOMIC_ADD8(page->memory_footprint, size);
+ if (__wt_page_is_modified(page)) {
+ (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+ }
+}
+
+/*
+ * __wt_cache_page_inmem_decr --
+ * Decrement a page's memory footprint in the cache.
+ */
+static inline void
+__wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
+{
+ WT_CACHE *cache;
+
+ size += WT_ALLOC_OVERHEAD;
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_SUB8(cache->bytes_inmem, size);
+ (void)WT_ATOMIC_SUB8(page->memory_footprint, size);
+ if (__wt_page_is_modified(page)) {
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size);
+ }
+}
+
+/*
+ * __wt_cache_dirty_incr --
+ * Increment the cache dirty page/byte counts.
+ */
+static inline void
+__wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ size_t size;
+
+ cache = S2C(session)->cache;
+ (void)WT_ATOMIC_ADD8(cache->pages_dirty, 1);
+
+ /*
+ * Take care to read the memory_footprint once in case we are racing
+ * with updates.
+ */
+ size = page->memory_footprint;
+ (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+}
+
+/*
+ * __wt_cache_dirty_decr --
+ * Decrement the cache dirty page/byte counts.
+ */
+static inline void
+__wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ size_t size;
+
+ cache = S2C(session)->cache;
+
+ if (cache->pages_dirty < 1) {
+ (void)__wt_errx(session,
+ "cache dirty decrement failed: cache dirty page count went "
+ "negative");
+ cache->pages_dirty = 0;
+ } else
+ (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1);
+
+ /*
+ * It is possible to decrement the footprint of the page without making
+ * the page dirty (for example when freeing an obsolete update list),
+ * so the footprint could change between read and decrement, and we
+ * might attempt to decrement by a different amount than the bytes held
+ * by the page.
+ *
+ * We catch that by maintaining a per-page dirty size, and fixing the
+ * cache stats if that is non-zero when the page is discarded.
+ *
+ * Also take care that the global size doesn't go negative. This may
+ * lead to small accounting errors (particularly on the last page of the
+ * last file in a checkpoint), but that will come out in the wash when
+ * the page is evicted.
+ */
+ size = WT_MIN(page->memory_footprint, cache->bytes_dirty);
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, size);
+ (void)WT_ATOMIC_SUB8(page->modify->bytes_dirty, size);
+}
+
+/*
+ * __wt_cache_page_evict --
+ * Evict pages from the cache.
+ */
+static inline void
+__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CACHE *cache;
+ WT_PAGE_MODIFY *mod;
+
+ cache = S2C(session)->cache;
+ mod = page->modify;
+
+ /*
+ * In rare cases, we may race tracking a page's dirty footprint.
+ * If so, we will get here with a non-zero dirty_size in the page, and
+ * we can fix the global stats.
+ */
+ if (mod != NULL && mod->bytes_dirty != 0)
+ (void)WT_ATOMIC_SUB8(cache->bytes_dirty, mod->bytes_dirty);
+
+ WT_ASSERT(session, page->memory_footprint != 0);
+ (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint);
+ page->memory_footprint = 0;
+
+ (void)WT_ATOMIC_ADD8(cache->pages_evict, 1);
+}
+
+/*
+ * __wt_cache_read_gen --
+ * Get the current read generation number.
+ */
+static inline uint64_t
+__wt_cache_read_gen(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->cache->read_gen);
+}
+
+/*
+ * __wt_cache_read_gen_incr --
+ * Increment the current read generation number.
+ */
+static inline void
+__wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
+{
+ ++S2C(session)->cache->read_gen;
+}
+
+/*
+ * __wt_cache_read_gen_set --
+ * Get the read generation to store in a page.
+ */
+static inline uint64_t
+__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+{
+ /*
+ * We return read-generations from the future (where "the future" is
+ * measured by increments of the global read generation). The reason
+ * is because when acquiring a new hazard pointer for a page, we can
+ * check its read generation, and if the read generation isn't less
+ * than the current global generation, we don't bother updating the
+ * page. In other words, the goal is to avoid some number of updates
+ * immediately after each update we have to make.
+ */
+ return (__wt_cache_read_gen(session) + WT_READGEN_STEP);
+}
+
+/*
+ * __wt_cache_pages_inuse --
+ * Return the number of pages in use.
+ */
+static inline uint64_t
+__wt_cache_pages_inuse(WT_CACHE *cache)
+{
+ return (cache->pages_inmem - cache->pages_evict);
+}
+
+/*
+ * __wt_cache_bytes_inuse --
+ * Return the number of bytes in use.
+ */
+static inline uint64_t
+__wt_cache_bytes_inuse(WT_CACHE *cache)
+{
+ return (cache->bytes_inmem - cache->bytes_evict);
+}
+
+/*
+ * __wt_page_refp --
+ * Return the page's index and slot for a reference.
+ */
+static inline void
+__wt_page_refp(WT_SESSION_IMPL *session,
+ WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
+{
+ WT_PAGE_INDEX *pindex;
+ uint32_t i;
+
+ WT_ASSERT(session,
+ WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);
+
+ /*
+ * Copy the parent page's index value: the page can split at any time,
+ * but the index's value is always valid, even if it's not up-to-date.
+ */
+retry: pindex = WT_INTL_INDEX_COPY(ref->home);
+
+ /*
+ * Use the page's reference hint: it should be correct unless the page
+ * split before our slot. If the page splits after our slot, the hint
+ * will point earlier in the array than our actual slot, so the first
+ * loop is from the hint to the end of the list, and the second loop
+ * is from the start of the list to the end of the list. (The second
+	 * loop overlaps the first, but that only happens in cases where we've
+	 * deepened the tree and aren't going to find our slot at all; that's
+ * not worth optimizing.)
+ *
+ * It's not an error for the reference hint to be wrong, it just means
+ * the first retrieval (which sets the hint for subsequent retrievals),
+ * is slower.
+ */
+ for (i = ref->ref_hint; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+ for (i = 0; i < pindex->entries; ++i)
+ if (pindex->index[i]->page == ref->page) {
+ *pindexp = pindex;
+ *slotp = ref->ref_hint = i;
+ return;
+ }
+
+ /*
+ * If we don't find our reference, the page split into a new level and
+ * our home pointer references the wrong page. After internal pages
+	 * deepen, their reference structures' home values are updated; yield and
+ * wait for that to happen.
+ */
+ __wt_yield();
+ goto retry;
+}
+
+/*
+ * __wt_page_modify_init --
+ * A page is about to be modified, allocate the modification structure.
+ */
+static inline int
+__wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ return (page->modify == NULL ?
+ __wt_page_modify_alloc(session, page) : 0);
+}
+
+/*
+ * __wt_page_only_modify_set --
+ * Mark the page (but only the page) dirty.
+ */
+static inline void
+__wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ uint64_t last_running;
+
+ last_running = 0;
+ if (page->modify->write_gen == 0)
+ last_running = S2C(session)->txn_global.last_running;
+
+ /*
+ * We depend on atomic-add being a write barrier, that is, a barrier to
+ * ensure all changes to the page are flushed before updating the page
+ * write generation and/or marking the tree dirty, otherwise checkpoints
+ * and/or page reconciliation might be looking at a clean page/tree.
+ *
+ * Every time the page transitions from clean to dirty, update the cache
+ * and transactional information.
+ */
+ if (WT_ATOMIC_ADD4(page->modify->write_gen, 1) == 1) {
+ __wt_cache_dirty_incr(session, page);
+
+ /*
+ * The page can never end up with changes older than the oldest
+ * running transaction.
+ */
+ if (F_ISSET(&session->txn, TXN_HAS_SNAPSHOT))
+ page->modify->disk_snap_min = session->txn.snap_min;
+
+ /*
+ * We won the race to dirty the page, but another thread could
+		 * have committed in the meantime, and the last_running field may
+		 * have been updated past it. That is all very unlikely, but not
+ * impossible, so we take care to read the global state before
+ * the atomic increment. If we raced with reconciliation, just
+ * leave the previous value here: at worst, we will write a
+ * page in a checkpoint when not absolutely necessary.
+ */
+ if (last_running != 0)
+ page->modify->first_dirty_txn = last_running;
+ }
+
+ /* Check if this is the largest transaction ID to update the page. */
+ if (TXNID_LT(page->modify->update_txn, session->txn.id))
+ page->modify->update_txn = session->txn.id;
+}
+
+/*
+ * __wt_page_modify_set --
+ * Mark the page and tree dirty.
+ */
+static inline void
+__wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /*
+	 * Mark the tree dirty (even if the page is already marked dirty):
+	 * newly created pages to support "empty" files are dirty, but the file
+	 * isn't marked dirty until there's a real change needing to be written.
+	 * Test before setting the dirty flag; it's a hot cache line.
+ *
+ * The tree's modified flag is cleared by the checkpoint thread: set it
+ * and insert a barrier before dirtying the page. (I don't think it's
+	 * a problem if the tree is marked dirty with all the pages clean; it
+	 * might result in an extra checkpoint that doesn't do any work, but it
+ * shouldn't cause problems; regardless, let's play it safe.)
+ */
+ if (S2BT(session)->modified == 0) {
+ S2BT(session)->modified = 1;
+ WT_FULL_BARRIER();
+ }
+
+ __wt_page_only_modify_set(session, page);
+}
+
+/*
+ * __wt_page_parent_modify_set --
+ * Mark the parent page and tree dirty.
+ */
+static inline int
+__wt_page_parent_modify_set(
+ WT_SESSION_IMPL *session, WT_REF *ref, int page_only)
+{
+ WT_PAGE *parent;
+
+ /*
+ * This function exists as a place to stash this comment. There are a
+ * few places where we need to dirty a page's parent. The trick is the
+ * page's parent might split at any point, and the page parent might be
+ * the wrong parent at any particular time. We ignore this and dirty
+ * whatever page the page's reference structure points to. This is safe
+ * because if we're pointing to the wrong parent, that parent must have
+ * split, deepening the tree, which implies marking the original parent
+ * and all of the newly-created children as dirty. In other words, if
+ * we have the wrong parent page, everything was marked dirty already.
+ */
+ parent = ref->home;
+ WT_RET(__wt_page_modify_init(session, parent));
+ if (page_only)
+ __wt_page_only_modify_set(session, parent);
+ else
+ __wt_page_modify_set(session, parent);
+ return (0);
+}
+
+/*
+ * __wt_off_page --
+ * Return if a pointer references off-page data.
+ */
+static inline int
+__wt_off_page(WT_PAGE *page, const void *p)
+{
+ /*
+ * There may be no underlying page, in which case the reference is
+ * off-page by definition.
+ */
+ return (page->dsk == NULL ||
+ p < (void *)page->dsk ||
+ p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size));
+}
+
+/*
+ * __wt_ref_key --
+ * Return a reference to a row-store internal page key as cheaply as
+ * possible.
+ */
+static inline void
+__wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
+{
+ uintptr_t v;
+
+ /*
+ * An internal page key is in one of two places: if we instantiated the
+ * key (for example, when reading the page), WT_REF.key.ikey references
+ * a WT_IKEY structure, otherwise WT_REF.key.ikey references an on-page
+ * key offset/length pair.
+ *
+ * Now the magic: allocated memory must be aligned to store any standard
+ * type, and we expect some standard type to require at least quad-byte
+ * alignment, so allocated memory should have some clear low-order bits.
+ * On-page objects consist of an offset/length pair: the maximum page
+ * size currently fits into 29 bits, so we use the low-order bits of the
+ * pointer to mark the other bits of the pointer as encoding the key's
+ * location and length. This breaks if allocated memory isn't aligned,
+ * of course.
+ *
+ * In this specific case, we use bit 0x01 to mark an on-page key, else
+ * it's a WT_IKEY reference. The bit pattern for internal row-store
+ * on-page keys is:
+ * 32 bits key length
+ * 31 bits page offset of the key's bytes,
+ *	 1 bit		flags
+ */
+#define WT_IK_FLAG 0x01
+#define WT_IK_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32)
+#define WT_IK_DECODE_KEY_LEN(v) ((v) >> 32)
+#define WT_IK_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 1)
+#define WT_IK_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 1)
+ v = (uintptr_t)ref->key.ikey;
+ if (v & WT_IK_FLAG) {
+ *(void **)keyp =
+ WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v));
+ *sizep = WT_IK_DECODE_KEY_LEN(v);
+ } else {
+ *(void **)keyp = WT_IKEY_DATA(ref->key.ikey);
+ *sizep = ((WT_IKEY *)ref->key.ikey)->size;
+ }
+}
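+
+/*
+ * To make the encoding concrete: a 7-byte key at page offset 100 is stored
+ * as the word (7 << 32) | (100 << 1) | 0x01, and decoding reverses the
+ * shifts. An illustrative self-check:
+ *
+ *	uintptr_t v = WT_IK_ENCODE_KEY_LEN(7) |
+ *	    WT_IK_ENCODE_KEY_OFFSET(100) | WT_IK_FLAG;
+ *	WT_ASSERT(session, WT_IK_DECODE_KEY_LEN(v) == 7);
+ *	WT_ASSERT(session, WT_IK_DECODE_KEY_OFFSET(v) == 100);
+ */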
+
+/*
+ * __wt_ref_key_onpage_set --
+ * Set a WT_REF to reference an on-page key.
+ */
+static inline void
+__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_ref_key for an explanation of the magic.
+ */
+ v = WT_IK_ENCODE_KEY_LEN(unpack->size) |
+ WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
+ WT_IK_FLAG;
+ ref->key.ikey = (void *)v;
+}
+
+/*
+ * __wt_ref_key_instantiated --
+ * Return if a WT_REF key is instantiated.
+ */
+static inline WT_IKEY *
+__wt_ref_key_instantiated(WT_REF *ref)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_ref_key for an explanation of the magic.
+ */
+ v = (uintptr_t)ref->key.ikey;
+ return (v & WT_IK_FLAG ? NULL : ref->key.ikey);
+}
+
+/*
+ * __wt_ref_key_clear --
+ * Clear a WT_REF key.
+ */
+static inline void
+__wt_ref_key_clear(WT_REF *ref)
+{
+ /* The key union has 2 fields, both of which are 8B. */
+ ref->key.recno = 0;
+}
+
+/*
+ * __wt_row_leaf_key_info --
+ * Return a row-store leaf page key referenced by a WT_ROW if it can be
+ * had without unpacking a cell, and information about the cell, if the key
+ * isn't cheaply available.
+ */
+static inline int
+__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
+ WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
+{
+ WT_IKEY *ikey;
+ uintptr_t v;
+
+ v = (uintptr_t)copy;
+
+ /*
+ * A row-store leaf page key is in one of two places: if instantiated,
+ * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
+ * references an on-page offset. Further, on-page keys are in one of
+ * two states: if the key is a simple key (not an overflow key, prefix
+ * compressed or Huffman encoded, all of which are likely), the key's
+ * offset/size is encoded in the pointer. Otherwise, the offset is to
+ * the key's on-page cell.
+ *
+ * Now the magic: allocated memory must be aligned to store any standard
+ * type, and we expect some standard type to require at least quad-byte
+ * alignment, so allocated memory should have some clear low-order bits.
+ * On-page objects consist of an offset/length pair: the maximum page
+ * size currently fits into 29 bits, so we use the low-order bits of the
+ * pointer to mark the other bits of the pointer as encoding the key's
+ * location and length. This breaks if allocated memory isn't aligned,
+ * of course.
+ *
+ * In this specific case, we use bit 0x01 to mark an on-page cell, bit
+ * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair,
+ * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells
+ * is:
+ * 29 bits page offset of the key's cell,
+ * 2 bits flags
+ *
+ * The bit pattern for on-page keys is:
+ * 32 bits key length,
+ * 29 bits page offset of the key's bytes,
+ * 2 bits flags
+ *
+ * But, while that allows us to skip decoding simple key cells, we also
+ * want to skip decoding the value cell in the case where the value cell
+ * is also simple/short. We use bit 0x03 to mark an encoded on-page key
+ * and value pair. The bit pattern for on-page key/value pairs is:
+ * 9 bits key length,
+ * 13 bits value length,
+ * 20 bits page offset of the key's bytes,
+ * 20 bits page offset of the value's bytes,
+ * 2 bits flags
+ *
+ * These bit patterns are in-memory only, of course, so can be modified
+ * (we could even tune for specific workloads). Generally, the fields
+ * are larger than the anticipated values being stored (512B keys, 8KB
+ * values, 1MB pages), hopefully that won't be necessary.
+ *
+ * This function returns a list of things about the key (instantiation
+ * reference, cell reference and key/length pair). Our callers know
+ * the order in which we look things up and the information returned;
+ * for example, the cell will never be returned if we are working with
+ * an on-page key.
+ */
+#define WT_CELL_FLAG 0x01
+#define WT_CELL_ENCODE_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_CELL_DECODE_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2)
+
+#define WT_K_FLAG 0x02
+#define WT_K_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 32)
+#define WT_K_DECODE_KEY_LEN(v) ((v) >> 32)
+#define WT_K_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_K_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 2)
+
+#define WT_KV_FLAG 0x03
+#define WT_KV_ENCODE_KEY_LEN(v) ((uintptr_t)(v) << 55)
+#define WT_KV_DECODE_KEY_LEN(v) ((v) >> 55)
+#define WT_KV_MAX_KEY_LEN (0x200 - 1)
+#define WT_KV_ENCODE_VALUE_LEN(v) ((uintptr_t)(v) << 42)
+#define WT_KV_DECODE_VALUE_LEN(v) (((v) & 0x007FFC0000000000) >> 42)
+#define WT_KV_MAX_VALUE_LEN (0x2000 - 1)
+#define WT_KV_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 22)
+#define WT_KV_DECODE_KEY_OFFSET(v) (((v) & 0x000003FFFFC00000) >> 22)
+#define WT_KV_MAX_KEY_OFFSET (0x100000 - 1)
+#define WT_KV_ENCODE_VALUE_OFFSET(v) ((uintptr_t)(v) << 2)
+#define WT_KV_DECODE_VALUE_OFFSET(v) (((v) & 0x00000000003FFFFC) >> 2)
+#define WT_KV_MAX_VALUE_OFFSET (0x100000 - 1)
+ switch (v & 0x03) {
+ case WT_CELL_FLAG:
+ /* On-page cell: no instantiated key. */
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (cellp != NULL)
+ *cellp =
+ WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
+ return (0);
+ case WT_K_FLAG:
+ /* Encoded key: no instantiated key, no cell. */
+ if (cellp != NULL)
+ *cellp = NULL;
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (datap != NULL) {
+ *(void **)datap =
+ WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
+ *sizep = WT_K_DECODE_KEY_LEN(v);
+ return (1);
+ }
+ return (0);
+ case WT_KV_FLAG:
+ /* Encoded key/value pair: no instantiated key, no cell. */
+ if (cellp != NULL)
+ *cellp = NULL;
+ if (ikeyp != NULL)
+ *ikeyp = NULL;
+ if (datap != NULL) {
+ *(void **)datap = WT_PAGE_REF_OFFSET(
+ page, WT_KV_DECODE_KEY_OFFSET(v));
+ *sizep = WT_KV_DECODE_KEY_LEN(v);
+ return (1);
+ }
+ return (0);
+	}
+
+ /* Instantiated key. */
+ ikey = copy;
+ if (ikeyp != NULL)
+ *ikeyp = copy;
+ if (cellp != NULL)
+ *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
+ if (datap != NULL) {
+ *(void **)datap = WT_IKEY_DATA(ikey);
+ *sizep = ikey->size;
+ return (1);
+ }
+ return (0);
+}
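+
+/*
+ * Again to make the magic concrete, an illustrative round-trip through the
+ * key/value encoding: a 5-byte key at page offset 64 and a 10-byte value
+ * at page offset 80 pack into a single word and decode back unchanged:
+ *
+ *	uintptr_t v = WT_KV_ENCODE_KEY_LEN(5) | WT_KV_ENCODE_VALUE_LEN(10) |
+ *	    WT_KV_ENCODE_KEY_OFFSET(64) | WT_KV_ENCODE_VALUE_OFFSET(80) |
+ *	    WT_KV_FLAG;
+ *	WT_ASSERT(session, WT_KV_DECODE_KEY_LEN(v) == 5);
+ *	WT_ASSERT(session, WT_KV_DECODE_VALUE_OFFSET(v) == 80);
+ */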
+
+/*
+ * __wt_row_leaf_key_set_cell --
+ * Set a WT_ROW to reference an on-page row-store leaf cell.
+ */
+static inline void
+__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+ v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) |
+ WT_CELL_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_key_set --
+ * Set a WT_ROW to reference an on-page row-store leaf key.
+ */
+static inline void
+__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t v;
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+ v = WT_K_ENCODE_KEY_LEN(unpack->size) |
+ WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
+ WT_K_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_value_set --
+ * Set a WT_ROW to reference an on-page row-store leaf value.
+ */
+static inline void
+__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
+{
+ uintptr_t key_len, key_offset, value_offset, v;
+
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+	if (!(v & WT_K_FLAG))			/* Not a simple on-page key */
+ return;
+
+ key_len = WT_K_DECODE_KEY_LEN(v); /* Key length */
+ if (key_len > WT_KV_MAX_KEY_LEN)
+ return;
+ if (unpack->size > WT_KV_MAX_VALUE_LEN) /* Value length */
+ return;
+
+ key_offset = WT_K_DECODE_KEY_OFFSET(v); /* Page offsets */
+ if (key_offset > WT_KV_MAX_KEY_OFFSET)
+ return;
+ value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
+ if (value_offset > WT_KV_MAX_VALUE_OFFSET)
+ return;
+
+ v = WT_KV_ENCODE_KEY_LEN(key_len) |
+ WT_KV_ENCODE_VALUE_LEN(unpack->size) |
+ WT_KV_ENCODE_KEY_OFFSET(key_offset) |
+ WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
+ WT_ROW_KEY_SET(rip, v);
+}
+
+/*
+ * __wt_row_leaf_key --
+ * Set a buffer to reference a row-store leaf page key as cheaply as
+ * possible.
+ */
+static inline int
+__wt_row_leaf_key(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, int instantiate)
+{
+ void *copy;
+
+ /*
+ * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
+ *
+ * The row-store key can change underfoot; explicitly take a copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * All we handle here are on-page keys (which should be a common case),
+ * and instantiated keys (which start out rare, but become more common
+ * as a leaf page is searched, instantiating prefix-compressed keys).
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, NULL, &key->data, &key->size))
+ return (0);
+
+ /*
+ * The alternative is an on-page cell with some kind of compressed or
+ * overflow key that's never been instantiated. Call the underlying
+ * worker function to figure it out.
+ */
+ return (__wt_row_leaf_key_work(session, page, rip, key, instantiate));
+}
+
+/*
+ * __wt_cursor_row_leaf_key --
+ * Set a buffer to reference a cursor-referenced row-store leaf page key.
+ */
+static inline int
+__wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key)
+{
+ WT_PAGE *page;
+ WT_ROW *rip;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * If the cursor references a WT_INSERT item, take the key from there,
+ * else take the key from the original page.
+ */
+ if (cbt->ins == NULL) {
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ rip = &page->u.row.d[cbt->slot];
+ WT_RET(__wt_row_leaf_key(session, page, rip, key, 0));
+ } else {
+ key->data = WT_INSERT_KEY(cbt->ins);
+ key->size = WT_INSERT_KEY_SIZE(cbt->ins);
+ }
+ return (0);
+}
+
+/*
+ * __wt_row_leaf_value_cell --
+ * Return a pointer to the value cell for a row-store leaf page key, or
+ * NULL if there isn't one.
+ */
+static inline WT_CELL *
+__wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack)
+{
+ WT_CELL *kcell, *vcell;
+ WT_CELL_UNPACK unpack;
+ void *copy, *key;
+ size_t size;
+
+ /* If we already have an unpacked key cell, use it. */
+ if (kpack != NULL)
+ vcell = (WT_CELL *)
+ ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack));
+ else {
+ /*
+ * The row-store key can change underfoot; explicitly take a
+ * copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Figure out where the key is, step past it to the value cell.
+ * The test for a cell not being set tells us that we have an
+ * on-page key, otherwise we're looking at an instantiated key
+ * or on-page cell, both of which require an unpack of the key's
+ * cell to find the value cell that follows.
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, &kcell, &key, &size) && kcell == NULL)
+ vcell = (WT_CELL *)((uint8_t *)key + size);
+ else {
+ __wt_cell_unpack(kcell, &unpack);
+ vcell = (WT_CELL *)((uint8_t *)
+ unpack.cell + __wt_cell_total_len(&unpack));
+ }
+ }
+
+ return (__wt_cell_leaf_value_parse(page, vcell));
+}
+
+/*
+ * __wt_row_leaf_value --
+ * Return the value for a row-store leaf page encoded key/value pair.
+ */
+static inline int
+__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
+{
+ uintptr_t v;
+
+ /* The row-store key can change underfoot; explicitly take a copy. */
+ v = (uintptr_t)WT_ROW_KEY_COPY(rip);
+
+ /*
+ * See the comment in __wt_row_leaf_key_info for an explanation of the
+ * magic.
+ */
+ if ((v & 0x03) == WT_KV_FLAG) {
+ value->data =
+ WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
+ value->size = WT_KV_DECODE_VALUE_LEN(v);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * __wt_ref_info --
+ * Return the addr/size and type triplet for a reference.
+ */
+static inline int
+__wt_ref_info(WT_SESSION_IMPL *session,
+ WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
+{
+ WT_ADDR *addr;
+ WT_CELL_UNPACK *unpack, _unpack;
+
+ addr = ref->addr;
+ unpack = &_unpack;
+
+ /*
+ * If NULL, there is no location.
+ * If off-page, the pointer references a WT_ADDR structure.
+ * If on-page, the pointer references a cell.
+ *
+ * The type is of a limited set: internal, leaf or no-overflow leaf.
+ */
+ if (addr == NULL) {
+ *addrp = NULL;
+ *sizep = 0;
+ if (typep != NULL)
+ *typep = 0;
+ } else if (__wt_off_page(ref->home, addr)) {
+ *addrp = addr->addr;
+ *sizep = addr->size;
+ if (typep != NULL)
+ switch (addr->type) {
+ case WT_ADDR_INT:
+ *typep = WT_CELL_ADDR_INT;
+ break;
+ case WT_ADDR_LEAF:
+ *typep = WT_CELL_ADDR_LEAF;
+ break;
+ case WT_ADDR_LEAF_NO:
+ *typep = WT_CELL_ADDR_LEAF_NO;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ } else {
+ __wt_cell_unpack((WT_CELL *)addr, unpack);
+ *addrp = unpack->data;
+ *sizep = unpack->size;
+ if (typep != NULL)
+ *typep = unpack->type;
+ }
+ return (0);
+}
+
+/*
+ * __wt_page_release --
+ * Release a reference to a page.
+ */
+static inline int
+__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ int locked;
+
+ btree = S2BT(session);
+
+ /*
+ * Discard our hazard pointer. Ignore pages we don't have and the root
+ * page, which sticks in memory, regardless.
+ */
+ if (ref == NULL || __wt_ref_is_root(ref))
+ return (0);
+ page = ref->page;
+
+ /*
+ * Attempt to evict pages with the special "oldest" read generation.
+ *
+ * This is set for pages that grow larger than the configured
+ * memory_page_max setting, and when we are attempting to scan without
+ * trashing the cache.
+ *
+ * Skip this if eviction is disabled for this operation or this tree,
+ * or if there is no chance of eviction succeeding for dirty pages due
+ * to a checkpoint or because we've already tried writing this page and
+ * it contains an update that isn't stable.
+ */
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ page->read_gen != WT_READGEN_OLDEST ||
+ F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
+ (__wt_page_is_modified(page) && (btree->checkpointing ||
+ !__wt_txn_visible_all(session, page->modify->first_dirty_txn))))
+ return (__wt_hazard_clear(session, page));
+
+ /*
+ * Take some care with order of operations: if we release the hazard
+ * reference without first locking the page, it could be evicted in
+ * between.
+ */
+ locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED);
+ WT_TRET(__wt_hazard_clear(session, page));
+ if (!locked)
+ return (ret);
+
+ (void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
+ if ((ret = __wt_evict_page(session, ref)) == 0)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
+ else {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
+ if (ret == EBUSY)
+ ret = 0;
+ }
+ (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+
+ return (ret);
+}
+
+/*
+ * __wt_page_swap_func --
+ * Swap one page's hazard pointer for another one when hazard pointer
+ * coupling up/down the tree.
+ */
+static inline int
+__wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
+ WT_REF *want, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ int acquired;
+
+ /*
+ * This function is here to simplify the error handling during hazard
+ * pointer coupling so we never leave a hazard pointer dangling. The
+ * assumption is we're holding a hazard pointer on "held", and want to
+ * acquire a hazard pointer on "want", releasing the hazard pointer on
+ * "held" when we're done.
+ */
+ ret = __wt_page_in_func(session, want, flags
+#ifdef HAVE_DIAGNOSTIC
+ , file, line
+#endif
+ );
+
+ /* An expected failure: WT_NOTFOUND when doing a cache-only read. */
+ if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND)
+ return (WT_NOTFOUND);
+
+ /* An expected failure: WT_RESTART */
+ if (ret == WT_RESTART)
+ return (WT_RESTART);
+
+ /* Discard the original held page. */
+ acquired = ret == 0;
+ WT_TRET(__wt_page_release(session, held, flags));
+
+ /*
+ * If there was an error discarding the original held page, discard
+ * the acquired page too, keeping it is never useful.
+ */
+ if (acquired && ret != 0)
+ WT_TRET(__wt_page_release(session, want, flags));
+ return (ret);
+}
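+
+/*
+ * An illustrative descent loop using hazard-pointer coupling, running until
+ * a leaf page is reached; the child selection ("pick_child") and the
+ * __wt_page_swap wrapper that supplies the diagnostic file/line arguments
+ * are sketched, not exact:
+ *
+ *	for (ref = &btree->root;;) {
+ *		child = pick_child(ref->page);
+ *		WT_RET(__wt_page_swap(session, ref, child, 0));
+ *		ref = child;
+ *	}
+ */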
+
+/*
+ * __wt_page_hazard_check --
+ * Return if there's a hazard pointer to the page in the system.
+ */
+static inline WT_HAZARD *
+__wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_HAZARD *hp;
+ WT_SESSION_IMPL *s;
+ uint32_t i, hazard_size, session_cnt;
+
+ conn = S2C(session);
+
+ /*
+ * No lock is required because the session array is fixed size, but it
+ * may contain inactive entries. We must review any active session
+ * that might contain a hazard pointer, so insert a barrier before
+ * reading the active session count. That way, no matter what sessions
+ * come or go, we'll check the slots for all of the sessions that could
+ * have been active when we started our check.
+ */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
+ if (!s->active)
+ continue;
+ WT_ORDERED_READ(hazard_size, s->hazard_size);
+ for (hp = s->hazard; hp < s->hazard + hazard_size; ++hp)
+ if (hp->page == page)
+ return (hp);
+ }
+ return (NULL);
+}
+
+/*
+ * __wt_skip_choose_depth --
+ * Randomly choose a depth for a skiplist insert.
+ */
+static inline u_int
+__wt_skip_choose_depth(WT_SESSION_IMPL *session)
+{
+ u_int d;
+
+ for (d = 1; d < WT_SKIP_MAXDEPTH &&
+ __wt_random(session->rnd) < WT_SKIP_PROBABILITY; d++)
+ ;
+ return (d);
+}
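+
+/*
+ * Assuming WT_SKIP_PROBABILITY is a quarter of the random range (the usual
+ * configuration), depth d is chosen with probability (3/4) * (1/4)^(d-1):
+ * the expected depth is 4/3 and deep towers are exponentially rare.
+ */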
+
+/*
+ * __wt_btree_size_overflow --
+ * Check if the size of an in-memory tree with a single leaf page is over
+ * a specified maximum. If called on anything other than a simple tree with a
+ * single leaf page, returns true so the calling code will switch to a new tree.
+ */
+static inline int
+__wt_btree_size_overflow(WT_SESSION_IMPL *session, uint64_t maxsize)
+{
+ WT_BTREE *btree;
+ WT_PAGE *child, *root;
+ WT_PAGE_INDEX *pindex;
+ WT_REF *first;
+
+ btree = S2BT(session);
+ root = btree->root.page;
+
+ /* Check for a non-existent tree. */
+ if (root == NULL)
+ return (0);
+
+ /* A tree that can be evicted always requires a switch. */
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (1);
+
+ /* Check for a tree with a single leaf page. */
+ pindex = WT_INTL_INDEX_COPY(root);
+ if (pindex->entries != 1) /* > 1 child page, switch */
+ return (1);
+
+ first = pindex->index[0];
+ if (first->state != WT_REF_MEM) /* no child page, ignore */
+ return (0);
+
+ /*
+ * We're reaching down into the page without a hazard pointer, but
+ * that's OK because we know that no-eviction is set and so the page
+ * cannot disappear.
+ */
+ child = first->page;
+ if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */
+ return (1);
+
+ return (child->memory_footprint > maxsize);
+}
+
+/*
+ * __wt_lex_compare --
+ * Lexicographic comparison routine.
+ *
+ * Returns:
+ * < 0 if user_item is lexicographically < tree_item
+ * = 0 if user_item is lexicographically = tree_item
+ * > 0 if user_item is lexicographically > tree_item
+ *
+ * We use the names "user" and "tree" so it's clear in the btree code which
+ * the application is looking at when we call its comparison func.
+ */
+static inline int
+__wt_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
+{
+ const uint8_t *userp, *treep;
+ size_t len, usz, tsz;
+
+ usz = user_item->size;
+ tsz = tree_item->size;
+ len = WT_MIN(usz, tsz);
+
+ for (userp = user_item->data, treep = tree_item->data;
+ len > 0;
+ --len, ++userp, ++treep)
+ if (*userp != *treep)
+ return (*userp < *treep ? -1 : 1);
+
+ /* Contents are equal up to the smallest length. */
+ return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+}
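+
+/*
+ * For example, comparing user item "abc" with tree item "abd" returns -1 at
+ * the third byte; comparing "ab" with "abc" exhausts the shorter item and
+ * returns -1 because the shorter item sorts first.
+ */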
+
+/*
+ * __wt_compare --
+ * The same as __wt_lex_compare, but using the application's collator
+ * function when configured.
+ */
+static inline int
+__wt_compare(WT_SESSION_IMPL *session, WT_COLLATOR *collator,
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp)
+{
+ if (collator == NULL) {
+ *cmpp = __wt_lex_compare(user_item, tree_item);
+ return (0);
+ }
+ return (collator->compare(
+ collator, &session->iface, user_item, tree_item, cmpp));
+}
+
+/*
+ * __wt_lex_compare_skip --
+ * Lexicographic comparison routine, skipping leading bytes.
+ *
+ * Returns:
+ * < 0 if user_item is lexicographically < tree_item
+ * = 0 if user_item is lexicographically = tree_item
+ * > 0 if user_item is lexicographically > tree_item
+ *
+ * We use the names "user" and "tree" so it's clear in the btree code which
+ * the application is looking at when we call its comparison func.
+ */
+static inline int
+__wt_lex_compare_skip(
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, size_t *matchp)
+{
+ const uint8_t *userp, *treep;
+ size_t len, usz, tsz;
+
+ usz = user_item->size;
+ tsz = tree_item->size;
+ len = WT_MIN(usz, tsz) - *matchp;
+
+ for (userp = (uint8_t *)user_item->data + *matchp,
+ treep = (uint8_t *)tree_item->data + *matchp;
+ len > 0;
+ --len, ++userp, ++treep, ++*matchp)
+ if (*userp != *treep)
+ return (*userp < *treep ? -1 : 1);
+
+ /* Contents are equal up to the smallest length. */
+ return ((usz == tsz) ? 0 : (usz < tsz) ? -1 : 1);
+}
+
+/*
+ * __wt_compare_skip --
+ * The same as __wt_lex_compare_skip, but using the application's collator
+ * function when configured.
+ */
+static inline int
+__wt_compare_skip(WT_SESSION_IMPL *session, WT_COLLATOR *collator,
+ const WT_ITEM *user_item, const WT_ITEM *tree_item, int *cmpp,
+ size_t *matchp)
+{
+ if (collator == NULL) {
+ *cmpp = __wt_lex_compare_skip(user_item, tree_item, matchp);
+ return (0);
+ }
+ return (collator->compare(
+ collator, &session->iface, user_item, tree_item, cmpp));
+}
diff --git a/src/third_party/wiredtiger/src/include/buf.i b/src/third_party/wiredtiger/src/include/buf.i
new file mode 100644
index 00000000000..09bee9ff831
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/buf.i
@@ -0,0 +1,133 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_buf_grow --
+ * Grow a buffer that may be in-use, and ensure that all data is local to
+ * the buffer.
+ */
+static inline int
+__wt_buf_grow(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ return (size > buf->memsize || !WT_DATA_IN_ITEM(buf) ?
+ __wt_buf_grow_worker(session, buf, size) : 0);
+}
+
+/*
+ * __wt_buf_extend --
+ * Grow a buffer that's currently in-use.
+ */
+static inline int
+__wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ /*
+ * The difference between __wt_buf_grow and __wt_buf_extend is that the
+ * latter is expected to be called repeatedly for the same buffer, and
+ * so grows the buffer exponentially to avoid repeated costly calls to
+ * realloc.
+ */
+ return (size > buf->memsize ?
+ __wt_buf_grow(session, buf, WT_MAX(size, 2 * buf->memsize)) : 0);
+}
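+
+/*
+ * For example, appending one byte at a time to a full 4KB buffer grows it
+ * to 8KB on the first overflow rather than reallocating per byte: N
+ * appends cost O(log N) reallocations instead of O(N).
+ */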
+
+/*
+ * __wt_buf_init --
+ * Initialize a buffer at a specific size.
+ */
+static inline int
+__wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ buf->data = buf->mem;
+ buf->size = 0; /* Clear existing data length */
+ WT_RET(__wt_buf_grow(session, buf, size));
+
+ return (0);
+}
+
+/*
+ * __wt_buf_initsize --
+ * Initialize a buffer at a specific size, and set the data length.
+ */
+static inline int
+__wt_buf_initsize(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ buf->data = buf->mem;
+ buf->size = 0; /* Clear existing data length */
+ WT_RET(__wt_buf_grow(session, buf, size));
+ buf->size = size; /* Set the data length. */
+
+ return (0);
+}
+
+/*
+ * __wt_buf_set --
+ * Set the contents of the buffer.
+ */
+static inline int
+__wt_buf_set(
+ WT_SESSION_IMPL *session, WT_ITEM *buf, const void *data, size_t size)
+{
+ /* Ensure the buffer is large enough. */
+ WT_RET(__wt_buf_initsize(session, buf, size));
+
+ /* Copy the data, allowing for overlapping strings. */
+ memmove(buf->mem, data, size);
+
+ return (0);
+}
+
+/*
+ * __wt_buf_setstr --
+ * Set the contents of the buffer to a NUL-terminated string.
+ */
+static inline int
+__wt_buf_setstr(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *s)
+{
+ return (__wt_buf_set(session, buf, s, strlen(s) + 1));
+}
+
+/*
+ * __wt_buf_set_printable --
+ * Set the contents of the buffer to a printable representation of a
+ * byte string.
+ */
+static inline int
+__wt_buf_set_printable(
+ WT_SESSION_IMPL *session, WT_ITEM *buf, const void *from_arg, size_t size)
+{
+ return (__wt_raw_to_esc_hex(session, from_arg, size, buf));
+}
+
+/*
+ * __wt_buf_free --
+ * Free a buffer.
+ */
+static inline void
+__wt_buf_free(WT_SESSION_IMPL *session, WT_ITEM *buf)
+{
+ __wt_free(session, buf->mem);
+
+ memset(buf, 0, sizeof(WT_ITEM));
+}
+
+/*
+ * __wt_scr_free --
+ * Release a scratch buffer.
+ */
+static inline void
+__wt_scr_free(WT_ITEM **bufp)
+{
+ WT_ITEM *buf;
+
+ if ((buf = *bufp) != NULL) {
+ *bufp = NULL;
+
+ buf->data = NULL;
+ buf->size = 0;
+ F_CLR(buf, WT_ITEM_INUSE);
+ }
+}
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
new file mode 100644
index 00000000000..b7dbd8401a9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Tuning constants: I hesitate to call this tuning, but we want to review some
+ * number of pages from each file's in-memory tree for each page we evict.
+ */
+#define WT_EVICT_INT_SKEW (1<<20) /* Prefer leaf pages over internal
+ pages by this many increments of the
+ read generation. */
+#define WT_EVICT_WALK_PER_FILE 10 /* Pages to visit per file */
+#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
+#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
+
+#define WT_EVICT_PASS_AGGRESSIVE 0x01
+#define WT_EVICT_PASS_ALL 0x02
+#define WT_EVICT_PASS_DIRTY 0x04
+
+/*
+ * WT_EVICT_ENTRY --
+ * Encapsulation of an eviction candidate.
+ */
+struct __wt_evict_entry {
+ WT_BTREE *btree; /* Enclosing btree object */
+ WT_REF *ref; /* Page to flush/evict */
+};
+
+/*
+ * WT_EVICT_WORKER --
+ * Encapsulation of an eviction worker thread.
+ */
+
+struct __wt_evict_worker {
+ WT_SESSION_IMPL *session;
+ u_int id;
+ wt_thread_t tid;
+#define WT_EVICT_WORKER_RUN 0x01
+ uint32_t flags;
+};
+
+/*
+ * WiredTiger cache structure.
+ */
+struct __wt_cache {
+ /*
+ * Different threads read/write pages to/from the cache and create pages
+ * in the cache, so we cannot know precisely how much memory is in use
+ * at any specific time. However, even though the values don't have to
+	 * be exact, they can't be garbage: we track what comes in and what goes
+ * out and calculate the difference as needed.
+ */
+ uint64_t bytes_inmem; /* Bytes/pages in memory */
+ uint64_t pages_inmem;
+ uint64_t bytes_evict; /* Bytes/pages discarded by eviction */
+ uint64_t pages_evict;
+ uint64_t bytes_dirty; /* Bytes/pages currently dirty */
+ uint64_t pages_dirty;
+
+ /*
+ * Read information.
+ */
+ uint64_t read_gen; /* Page read generation (LRU) */
+
+ /*
+ * Eviction thread information.
+ */
+ WT_CONDVAR *evict_cond; /* Eviction server condition */
+ WT_SPINLOCK evict_lock; /* Eviction LRU queue */
+ WT_SPINLOCK evict_walk_lock; /* Eviction walk location */
+ /* Condition signalled when the eviction server populates the queue */
+ WT_CONDVAR *evict_waiter_cond;
+
+ u_int eviction_trigger; /* Percent to trigger eviction */
+ u_int eviction_target; /* Percent to end eviction */
+ u_int eviction_dirty_target; /* Percent to allow dirty */
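+	/*
+	 * For example (illustrative numbers, not defaults taken from this
+	 * change): with eviction_trigger 95 and eviction_target 80,
+	 * application threads wake the server once the cache passes 95%
+	 * full, and the server keeps evicting until usage drops to 80%.
+	 */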
+
+ /*
+ * LRU eviction list information.
+ */
+ WT_EVICT_ENTRY *evict; /* LRU pages being tracked */
+ WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */
+ uint32_t evict_candidates; /* LRU list pages to evict */
+ uint32_t evict_entries; /* LRU entries in the queue */
+ volatile uint32_t evict_max; /* LRU maximum eviction slot used */
+ uint32_t evict_slots; /* LRU list eviction slots */
+ WT_DATA_HANDLE
+ *evict_file_next; /* LRU next file to search */
+
+ /*
+ * Sync/flush request information.
+ */
+ volatile uint64_t sync_request; /* File sync requests */
+ volatile uint64_t sync_complete;/* File sync requests completed */
+
+ /*
+ * Cache pool information.
+ */
+ uint64_t cp_saved_evict; /* Evict count from last pass */
+ uint64_t cp_current_evict; /* Evict count from current pass */
+ uint32_t cp_skip_count; /* Post change stabilization */
+ uint64_t cp_reserved; /* Base size for this cache */
+ WT_SESSION_IMPL *cp_session; /* May be used for cache management */
+ wt_thread_t cp_tid; /* Thread ID for cache pool manager */
+
+ /*
+ * Flags.
+ */
+#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */
+#define WT_CACHE_POOL_RUN 0x02 /* Cache pool thread running */
+#define WT_EVICT_ACTIVE 0x04 /* Eviction server is active */
+#define WT_EVICT_CLEAR_WALKS 0x08 /* Clear eviction walks */
+#define WT_EVICT_NO_PROGRESS 0x10 /* Check if pages are being evicted */
+#define WT_EVICT_STUCK 0x20 /* Eviction server is stuck */
+ uint32_t flags;
+};
+
+/*
+ * WT_CACHE_POOL --
+ * A structure that represents a shared cache.
+ */
+struct __wt_cache_pool {
+ WT_SPINLOCK cache_pool_lock;
+ WT_CONDVAR *cache_pool_cond;
+ const char *name;
+ uint64_t size;
+ uint64_t chunk;
+ uint64_t currently_used;
+ uint32_t refs; /* Reference count for structure. */
+ /* Locked: List of connections participating in the cache pool. */
+ TAILQ_HEAD(__wt_cache_pool_qh, __wt_connection_impl) cache_pool_qh;
+
+#define WT_CACHE_POOL_MANAGED 0x01 /* Cache pool has a manager thread */
+#define WT_CACHE_POOL_ACTIVE 0x02 /* Cache pool is active */
+ uint8_t flags_atomic;
+};
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
new file mode 100644
index 00000000000..fdb7302f4a8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_eviction_check --
+ * Wake the eviction server if necessary.
+ */
+static inline int
+__wt_eviction_check(WT_SESSION_IMPL *session, int *fullp, int wake)
+{
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /*
+ * If we're over the maximum cache, shut out reads (which include page
+	 * allocations) until we evict back under the maximum cache. Eviction
+	 * will keep pushing out pages so we don't run on the edge all the
+	 * time. Avoid division by zero if the cache size has not yet been set
+	 * in a shared cache.
+ */
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ dirty_inuse = cache->bytes_dirty;
+ bytes_max = conn->cache_size + 1;
+
+ /* Calculate the cache full percentage. */
+ *fullp = (int)((100 * bytes_inuse) / bytes_max);
+
+ /* Wake eviction when we're over the trigger cache size. */
+ if (wake &&
+ (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100 ||
+ dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100))
+ WT_RET(__wt_evict_server_wake(session));
+ return (0);
+}
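+
+/*
+ * A worked example of the arithmetic above, with illustrative numbers not
+ * taken from this change: a 100MB cache with 90MB in use yields
+ * *fullp == (100 * 90MB) / (100MB + 1) == 89; with a 95% eviction trigger,
+ * the eviction server stays asleep until usage crosses 95MB.
+ */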
+
+/*
+ * __wt_session_can_wait --
+ *	Return if a session is available for a potentially slow operation.
+ */
+static inline int
+__wt_session_can_wait(WT_SESSION_IMPL *session)
+{
+ /*
+	 * Return if a session is available for a potentially slow operation;
+ * for example, used by the block manager in the case of flushing
+ * the system cache.
+ */
+ if (!F_ISSET(session, WT_SESSION_CAN_WAIT))
+ return (0);
+
+ /*
+	 * LSM sets the no-cache-check flag when holding the LSM tree lock;
+	 * in that case, or when holding the schema lock, we don't want to
+	 * hijack the thread for eviction.
+ */
+ if (F_ISSET(session,
+ WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
+ return (0);
+
+ return (1);
+}
+
+/*
+ * __wt_cache_full_check --
+ * Wait for there to be space in the cache before a read or update.
+ */
+static inline int
+__wt_cache_full_check(WT_SESSION_IMPL *session)
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+ int busy, count, full;
+
+ /*
+	 * LSM sets the no-cache-check flag when holding the LSM tree lock;
+	 * in that case, or when holding the schema lock, we don't want to
+	 * hijack the thread for eviction.
+ */
+ if (F_ISSET(session,
+ WT_SESSION_NO_CACHE_CHECK | WT_SESSION_SCHEMA_LOCKED))
+ return (0);
+
+ /*
+ * Threads operating on trees that cannot be evicted are ignored,
+ * mostly because they're not contributing to the problem.
+ */
+ if ((btree = S2BT_SAFE(session)) != NULL &&
+ F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (0);
+
+ /*
+ * Only wake the eviction server the first time through here (if the
+ * cache is too full).
+ *
+	 * If the cache is less than 95% full, there's no work to be done.
+ */
+ WT_RET(__wt_eviction_check(session, &full, 1));
+ if (full < 95)
+ return (0);
+
+ /*
+ * If we are at the API boundary and the cache is more than 95% full,
+ * try to evict at least one page before we start an operation. This
+ * helps with some eviction-dominated workloads.
+ *
+ * If the current transaction is keeping the oldest ID pinned, it is in
+ * the middle of an operation. This may prevent the oldest ID from
+ * moving forward, leading to deadlock, so only evict what we can.
+ * Otherwise, we are at a transaction boundary and we can work harder
+ * to make sure there is free space in the cache.
+ */
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+ busy = txn_state->id != WT_TXN_NONE ||
+ session->nhazard > 0 ||
+ (txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id);
+ if (busy && full < 100)
+ return (0);
+ count = busy ? 1 : 10;
+
+ for (;;) {
+ switch (ret = __wt_evict_lru_page(session, 1)) {
+ case 0:
+ if (--count == 0)
+ return (0);
+ break;
+ case EBUSY:
+ continue;
+ case WT_NOTFOUND:
+ break;
+ default:
+ return (ret);
+ }
+
+ WT_RET(__wt_eviction_check(session, &full, 0));
+ if (full < 100)
+ return (0);
+ else if (ret == 0)
+ continue;
+
+ /*
+ * The cache is still full and no pages were found in the queue
+ * to evict. If this transaction is the one holding back the
+ * oldest ID, we can't wait forever. We'll block next time we
+ * are not busy.
+ */
+ if (busy) {
+ __wt_txn_update_oldest(session);
+ if (txn_state->id == txn_global->oldest_id ||
+ txn_state->snap_min == txn_global->oldest_id)
+ return (0);
+ }
+
+ /* Wait for the queue to re-populate before trying again. */
+ WT_RET(__wt_cond_wait(session,
+ S2C(session)->cache->evict_waiter_cond, 100000));
+
+ /* Check if things have changed so that we are busy. */
+ if (!busy && txn_state->snap_min != WT_TXN_NONE &&
+ txn_global->current != txn_global->oldest_id)
+ busy = count = 1;
+ }
+}
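+
+/*
+ * A worked example of the policy above, not part of the original change: a
+ * thread inside a transaction (busy) with the cache 97% full returns
+ * immediately; at 100% it evicts a single page and re-checks, while a
+ * thread at a transaction boundary works through up to ten pages before
+ * waiting on the eviction queue.
+ */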
diff --git a/src/third_party/wiredtiger/src/include/cell.i b/src/third_party/wiredtiger/src/include/cell.i
new file mode 100644
index 00000000000..42c7c07a30c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cell.i
@@ -0,0 +1,816 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_CELL --
+ * Variable-length cell type.
+ *
+ * Pages containing variable-length keys or value data (the WT_PAGE_ROW_INT,
+ * WT_PAGE_ROW_LEAF, WT_PAGE_COL_INT and WT_PAGE_COL_VAR page types) have
+ * cells after the page header.
+ *
+ * There are 4 basic cell types: keys and data (each of which has an overflow
+ * form), deleted cells and off-page references. The cell is usually followed
+ * by additional data, varying by type: a key or data cell is followed by a set
+ * of bytes; an address cookie follows overflow or off-page cells.
+ *
+ * Deleted cells are place-holders for column-store files, where entries cannot
+ * be removed in order to preserve the record count.
+ *
+ * Here's the cell usage by page type:
+ *
+ * WT_PAGE_ROW_INT (row-store internal page):
+ * Keys and offpage-reference pairs (a WT_CELL_KEY or WT_CELL_KEY_OVFL
+ * cell followed by a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_ROW_LEAF (row-store leaf page):
+ * Keys with optional data cells (a WT_CELL_KEY or WT_CELL_KEY_OVFL cell,
+ * normally followed by a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell).
+ *
+ * WT_PAGE_ROW_LEAF pages optionally prefix-compress keys, using a single
+ * byte count immediately following the cell.
+ *
+ * WT_PAGE_COL_INT (Column-store internal page):
+ * Off-page references (a WT_CELL_ADDR_XXX cell).
+ *
+ * WT_PAGE_COL_VAR (Column-store leaf page storing variable-length cells):
+ * Data cells (a WT_CELL_{VALUE,VALUE_COPY,VALUE_OVFL} cell), or deleted
+ * cells (a WT_CELL_DEL cell).
+ *
+ * Each cell starts with a descriptor byte:
+ *
+ * Bits 1 and 2 are reserved for "short" key and value cells (that is, a cell
+ * carrying data less than 64B, where we can store the data length in the cell
+ * descriptor byte):
+ * 0x00 Not a short key/data cell
+ * 0x01 Short key cell
+ *	0x02	Short key cell, with a following prefix-compression byte
+ *	0x03	Short value cell
+ * In these cases, the other 6 bits of the descriptor byte are the data length.
+ *
+ * Bit 3 marks an 8B packed, uint64_t value following the cell description byte.
+ * (A run-length counter or a record number for variable-length column store.)
+ *
+ * Bit 4 is unused.
+ *
+ * Bits 5-8 are cell "types".
+ */
+#define WT_CELL_KEY_SHORT 0x01 /* Short key */
+#define WT_CELL_KEY_SHORT_PFX 0x02 /* Short key with prefix byte */
+#define WT_CELL_VALUE_SHORT 0x03 /* Short data */
+#define WT_CELL_SHORT_TYPE(v) ((v) & 0x03U)
+
+#define WT_CELL_SHORT_MAX 63 /* Maximum short key/value */
+#define WT_CELL_SHORT_SHIFT 2 /* Shift for short key/value */
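+
+/*
+ * Worked example, not part of the original header: a 20-byte value stored
+ * as a short cell has descriptor byte
+ * (20 << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT == 0x53; readers
+ * recover the type as WT_CELL_SHORT_TYPE(0x53) == WT_CELL_VALUE_SHORT and
+ * the length as 0x53 >> WT_CELL_SHORT_SHIFT == 20.
+ */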
+
+#define WT_CELL_64V 0x04 /* Associated value */
+
+/*
+ * We could use bit 4 as a single bit (similar to bit 3), or as a type bit in a
+ * backward compatible way by adding bit 4 to the type mask and adding new types
+ * that incorporate it.
+ */
+#define WT_CELL_UNUSED_BIT4 0x08 /* Unused */
+
+/*
+ * WT_CELL_ADDR_INT is an internal block location, WT_CELL_ADDR_LEAF is a leaf
+ * block location, and WT_CELL_ADDR_LEAF_NO is a leaf block location where the
+ * page has no overflow items. (The goal is to speed up truncation as we don't
+ * have to read pages without overflow items in order to delete them. Note,
+ * WT_CELL_ADDR_LEAF_NO is not guaranteed to be set on every page without
+ * overflow items, the only guarantee is that if set, the page has no overflow
+ * items.)
+ *
+ * WT_CELL_VALUE_COPY is a reference to a previous cell on the page, supporting
+ * value dictionaries: if the two values are the same, we only store them once
+ * and have the second and subsequent use reference the original.
+ */
+#define WT_CELL_ADDR_DEL (0) /* Address: deleted */
+#define WT_CELL_ADDR_INT (1 << 4) /* Address: internal */
+#define WT_CELL_ADDR_LEAF (2 << 4) /* Address: leaf */
+#define WT_CELL_ADDR_LEAF_NO (3 << 4) /* Address: leaf no overflow */
+#define WT_CELL_DEL (4 << 4) /* Deleted value */
+#define WT_CELL_KEY (5 << 4) /* Key */
+#define WT_CELL_KEY_OVFL (6 << 4) /* Overflow key */
+#define WT_CELL_KEY_OVFL_RM (12 << 4) /* Overflow key (removed) */
+#define WT_CELL_KEY_PFX (7 << 4) /* Key with prefix byte */
+#define WT_CELL_VALUE (8 << 4) /* Value */
+#define WT_CELL_VALUE_COPY (9 << 4) /* Value copy */
+#define WT_CELL_VALUE_OVFL (10 << 4) /* Overflow value */
+#define WT_CELL_VALUE_OVFL_RM (11 << 4) /* Overflow value (removed) */
+
+#define WT_CELL_TYPE_MASK (0x0fU << 4) /* Maximum 16 cell types */
+#define WT_CELL_TYPE(v) ((v) & WT_CELL_TYPE_MASK)
+
+/*
+ * When we aren't able to create a short key or value (and, in the case of a
+ * value, there's no associated RLE), the key or value is at least 64B, else
+ * we'd have been able to store it as a short cell. Decrement/Increment the
+ * size before storing it, in the hopes that relatively small key/value sizes
+ * will pack into a single byte instead of two bytes.
+ */
+#define WT_CELL_SIZE_ADJUST 64
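+
+/*
+ * Worked example, with illustrative numbers not taken from this change: a
+ * 100-byte key is stored with length 100 - WT_CELL_SIZE_ADJUST == 36, which
+ * fits the single-byte range of the packed integer encoding used here,
+ * where the unadjusted 100 would need two bytes.
+ */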
+
+/*
+ * WT_CELL --
+ * Variable-length, on-page cell header.
+ */
+struct __wt_cell {
+ /*
+ * Maximum of 16 bytes:
+ * 1: cell descriptor byte
+ * 1: prefix compression count
+ * 9: associated 64-bit value (uint64_t encoding, max 9 bytes)
+ * 5: data length (uint32_t encoding, max 5 bytes)
+ *
+ * This calculation is pessimistic: the prefix compression count and
+ * 64V value overlap, the 64V value and data length are optional.
+ */
+ uint8_t __chunk[1 + 1 + WT_INTPACK64_MAXSIZE + WT_INTPACK32_MAXSIZE];
+};
+
+/*
+ * WT_CELL_UNPACK --
+ * Unpacked cell.
+ */
+struct __wt_cell_unpack {
+ WT_CELL *cell; /* Cell's disk image address */
+
+ uint64_t v; /* RLE count or recno */
+
+ /*
+ * !!!
+	 * The size and __len fields would reasonably be type size_t; don't
+	 * change the type, performance drops significantly if they're size_t.
+ */
+ const void *data; /* Data */
+ uint32_t size; /* Data size */
+
+ uint32_t __len; /* Cell + data length (usually) */
+
+ uint8_t prefix; /* Cell prefix length */
+
+	uint8_t raw; /* Raw cell type (includes "shorts") */
+ uint8_t type; /* Cell type */
+
+ uint8_t ovfl; /* boolean: cell is an overflow */
+};
+
+/*
+ * WT_CELL_FOREACH --
+ * Walk the cells on a page.
+ */
+#define WT_CELL_FOREACH(btree, dsk, cell, unpack, i) \
+ for ((cell) = \
+ WT_PAGE_HEADER_BYTE(btree, dsk), (i) = (dsk)->u.entries; \
+ (i) > 0; \
+ (cell) = (WT_CELL *)((uint8_t *)(cell) + (unpack)->__len), --(i))
+
+/*
+ * __wt_cell_pack_addr --
+ * Pack an address cell.
+ */
+static inline size_t
+__wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+
+ if (recno == 0)
+ cell->__chunk[0] = cell_type; /* Type */
+ else {
+ cell->__chunk[0] = cell_type | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, recno); /* Record number */
+ }
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_data --
+ * Set a data item's WT_CELL contents.
+ */
+static inline size_t
+__wt_cell_pack_data(WT_CELL *cell, uint64_t rle, size_t size)
+{
+ uint8_t byte, *p;
+
+ /*
+ * Short data cells without run-length encoding have 6 bits of data
+ * length in the descriptor byte.
+ */
+ if (rle < 2 && size <= WT_CELL_SHORT_MAX) {
+ byte = (uint8_t)size; /* Type + length */
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT;
+ return (1);
+ }
+
+ p = cell->__chunk + 1;
+ if (rle < 2) {
+ size -= WT_CELL_SIZE_ADJUST;
+ cell->__chunk[0] = WT_CELL_VALUE; /* Type */
+ } else {
+ cell->__chunk[0] = WT_CELL_VALUE | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ }
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+ return (WT_PTRDIFF(p, cell));
+}
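+
+/*
+ * Worked example, not part of the original change: packing a 10-byte value
+ * with no RLE takes the short-cell path above, writing the single
+ * descriptor byte (10 << WT_CELL_SHORT_SHIFT) | WT_CELL_VALUE_SHORT ==
+ * 0x2b and returning 1; the caller copies the 10 data bytes immediately
+ * after the cell.
+ */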
+
+/*
+ * __wt_cell_pack_data_match --
+ * Return if two items would have identical WT_CELLs (except for any RLE).
+ */
+static inline int
+__wt_cell_pack_data_match(
+ WT_CELL *page_cell, WT_CELL *val_cell, const uint8_t *val_data, int *matchp)
+{
+ const uint8_t *a, *b;
+ uint64_t av, bv;
+ int rle;
+
+ *matchp = 0; /* Default to no-match */
+
+ /*
+ * This is a special-purpose function used by reconciliation to support
+ * dictionary lookups. We're passed an on-page cell and a created cell
+ * plus a chunk of data we're about to write on the page, and we return
+ * if they would match on the page. The column-store comparison ignores
+ * the RLE because the copied cell will have its own RLE.
+ */
+ a = (uint8_t *)page_cell;
+ b = (uint8_t *)val_cell;
+
+ if (WT_CELL_SHORT_TYPE(a[0]) == WT_CELL_VALUE_SHORT) {
+ av = a[0] >> WT_CELL_SHORT_SHIFT;
+ ++a;
+ } else if (WT_CELL_TYPE(a[0]) == WT_CELL_VALUE) {
+ rle = a[0] & WT_CELL_64V ? 1 : 0; /* Skip any RLE */
+ ++a;
+ if (rle)
+ WT_RET(__wt_vunpack_uint(&a, 0, &av));
+ WT_RET(__wt_vunpack_uint(&a, 0, &av)); /* Length */
+ } else
+ return (0);
+
+ if (WT_CELL_SHORT_TYPE(b[0]) == WT_CELL_VALUE_SHORT) {
+ bv = b[0] >> WT_CELL_SHORT_SHIFT;
+ ++b;
+ } else if (WT_CELL_TYPE(b[0]) == WT_CELL_VALUE) {
+ rle = b[0] & WT_CELL_64V ? 1 : 0; /* Skip any RLE */
+ ++b;
+ if (rle)
+ WT_RET(__wt_vunpack_uint(&b, 0, &bv));
+ WT_RET(__wt_vunpack_uint(&b, 0, &bv)); /* Length */
+ } else
+ return (0);
+
+ if (av == bv)
+ *matchp = memcmp(a, val_data, av) == 0 ? 1 : 0;
+ return (0);
+}
+
+/*
+ * __wt_cell_pack_copy --
+ * Write a copy value cell.
+ */
+static inline size_t
+__wt_cell_pack_copy(WT_CELL *cell, uint64_t rle, uint64_t v)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+
+ if (rle < 2) /* Type */
+ cell->__chunk[0] = WT_CELL_VALUE_COPY;
+ else { /* Type */
+ cell->__chunk[0] = WT_CELL_VALUE_COPY | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ }
+ (void)__wt_vpack_uint(&p, 0, v); /* Copy offset */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_del --
+ * Write a deleted value cell.
+ */
+static inline size_t
+__wt_cell_pack_del(WT_CELL *cell, uint64_t rle)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+ if (rle < 2) { /* Type */
+ cell->__chunk[0] = WT_CELL_DEL;
+ return (1);
+ }
+ /* Type */
+ cell->__chunk[0] = WT_CELL_DEL | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_int_key --
+ * Set a row-store internal page key's WT_CELL contents.
+ */
+static inline size_t
+__wt_cell_pack_int_key(WT_CELL *cell, size_t size)
+{
+ uint8_t byte, *p;
+
+ /* Short keys have 6 bits of data length in the descriptor byte. */
+ if (size <= WT_CELL_SHORT_MAX) {
+ byte = (uint8_t)size;
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT;
+ return (1);
+ }
+
+ cell->__chunk[0] = WT_CELL_KEY; /* Type */
+ p = cell->__chunk + 1;
+
+ size -= WT_CELL_SIZE_ADJUST;
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_leaf_key --
+ * Set a row-store leaf page key's WT_CELL contents.
+ */
+static inline size_t
+__wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size)
+{
+ uint8_t byte, *p;
+
+ /* Short keys have 6 bits of data length in the descriptor byte. */
+ if (size <= WT_CELL_SHORT_MAX) {
+ if (prefix == 0) {
+ byte = (uint8_t)size; /* Type + length */
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT;
+ return (1);
+ } else {
+ byte = (uint8_t)size; /* Type + length */
+ cell->__chunk[0] =
+ (byte << WT_CELL_SHORT_SHIFT) |
+ WT_CELL_KEY_SHORT_PFX;
+ cell->__chunk[1] = prefix; /* Prefix */
+ return (2);
+ }
+ }
+
+ if (prefix == 0) {
+ cell->__chunk[0] = WT_CELL_KEY; /* Type */
+ p = cell->__chunk + 1;
+ } else {
+ cell->__chunk[0] = WT_CELL_KEY_PFX; /* Type */
+ cell->__chunk[1] = prefix; /* Prefix */
+ p = cell->__chunk + 2;
+ }
+
+ size -= WT_CELL_SIZE_ADJUST;
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_pack_ovfl --
+ * Pack an overflow cell.
+ */
+static inline size_t
+__wt_cell_pack_ovfl(WT_CELL *cell, uint8_t type, uint64_t rle, size_t size)
+{
+ uint8_t *p;
+
+ p = cell->__chunk + 1;
+ if (rle < 2) /* Type */
+ cell->__chunk[0] = type;
+ else {
+ cell->__chunk[0] = type | WT_CELL_64V;
+ (void)__wt_vpack_uint(&p, 0, rle); /* RLE */
+ }
+ (void)__wt_vpack_uint(&p, 0, (uint64_t)size); /* Length */
+ return (WT_PTRDIFF(p, cell));
+}
+
+/*
+ * __wt_cell_rle --
+ * Return the cell's RLE value.
+ */
+static inline uint64_t
+__wt_cell_rle(WT_CELL_UNPACK *unpack)
+{
+ /*
+ * Any item with only 1 occurrence is stored with an RLE of 0, that is,
+ * without any RLE at all. This code is a single place to handle that
+ * correction, for simplicity.
+ */
+ return (unpack->v < 2 ? 1 : unpack->v);
+}
+
+/*
+ * __wt_cell_total_len --
+ * Return the cell's total length, including data.
+ */
+static inline size_t
+__wt_cell_total_len(WT_CELL_UNPACK *unpack)
+{
+ /*
+ * The length field is specially named because it's dangerous to use it:
+ * it represents the length of the current cell (normally used for the
+ * loop that walks through cells on the page), but occasionally we want
+ * to copy a cell directly from the page, and what we need is the cell's
+ * total length. The problem is dictionary-copy cells, because in that
+ * case, the __len field is the length of the current cell, not the cell
+ * for which we're returning data. To use the __len field, you must be
+ * sure you're not looking at a copy cell.
+ */
+ return (unpack->__len);
+}
+
+/*
+ * __wt_cell_type --
+ * Return the cell's type (collapsing special types).
+ */
+static inline u_int
+__wt_cell_type(WT_CELL *cell)
+{
+ u_int type;
+
+ switch (WT_CELL_SHORT_TYPE(cell->__chunk[0])) {
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_KEY_SHORT_PFX:
+ return (WT_CELL_KEY);
+ case WT_CELL_VALUE_SHORT:
+ return (WT_CELL_VALUE);
+ }
+
+ switch (type = WT_CELL_TYPE(cell->__chunk[0])) {
+ case WT_CELL_KEY_PFX:
+ return (WT_CELL_KEY);
+ case WT_CELL_KEY_OVFL_RM:
+ return (WT_CELL_KEY_OVFL);
+ case WT_CELL_VALUE_OVFL_RM:
+ return (WT_CELL_VALUE_OVFL);
+ }
+ return (type);
+}
+
+/*
+ * __wt_cell_type_raw --
+ * Return the cell's type.
+ */
+static inline u_int
+__wt_cell_type_raw(WT_CELL *cell)
+{
+ return (WT_CELL_SHORT_TYPE(cell->__chunk[0]) == 0 ?
+ WT_CELL_TYPE(cell->__chunk[0]) :
+ WT_CELL_SHORT_TYPE(cell->__chunk[0]));
+}
+
+/*
+ * __wt_cell_type_reset --
+ * Reset the cell's type.
+ */
+static inline void
+__wt_cell_type_reset(
+ WT_SESSION_IMPL *session, WT_CELL *cell, u_int old_type, u_int new_type)
+{
+ /*
+ * For all current callers of this function, this should happen once
+ * and only once, assert we're setting what we think we're setting.
+ */
+ WT_ASSERT(session, old_type == 0 || old_type == __wt_cell_type(cell));
+ WT_UNUSED(old_type);
+
+ cell->__chunk[0] =
+ (cell->__chunk[0] & ~WT_CELL_TYPE_MASK) | WT_CELL_TYPE(new_type);
+}
+
+/*
+ * __wt_cell_leaf_value_parse --
+ * Return the cell if it's a row-store leaf page value, otherwise return
+ * NULL.
+ */
+static inline WT_CELL *
+__wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
+{
+ /*
+ * This function exists so there's a place for this comment.
+ *
+ * Row-store leaf pages may have a single data cell between each key, or
+ * keys may be adjacent (when the data cell is empty).
+ *
+ * One special case: if the last key on a page is a key without a value,
+ * don't walk off the end of the page: the size of the underlying disk
+ * image is exact, which means the end of the last cell on the page plus
+ * the length of the cell should be the byte immediately after the page
+ * disk image.
+ *
+ * !!!
+ * This line of code is really a call to __wt_off_page, but we know the
+ * cell we're given will either be on the page or past the end of page,
+ * so it's a simpler check. (I wouldn't bother, but the real problem is
+ * we can't call __wt_off_page directly, it's in btree.i which requires
+ * this file be included first.)
+ */
+ if (cell >= (WT_CELL *)((uint8_t *)page->dsk + page->dsk->mem_size))
+ return (NULL);
+
+ switch (__wt_cell_type_raw(cell)) {
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_KEY_SHORT_PFX:
+ return (NULL);
+ default:
+ return (cell);
+ }
+}
+
+/*
+ * __wt_cell_unpack_safe --
+ * Unpack a WT_CELL into a structure during verification.
+ */
+static inline int
+__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
+{
+ uint64_t saved_v, v;
+ uint32_t saved_len;
+ int copied;
+ const uint8_t *p;
+
+ copied = 0;
+ saved_len = 0;
+ saved_v = 0;
+
+ /*
+ * The verification code specifies an end argument, a pointer to 1 past
+ * the end-of-page. In that case, make sure we don't go past the end
+ * of the page when reading. If an error occurs, we simply return the
+ * error code, the verification code takes care of complaining (and, in
+ * the case of salvage, it won't complain at all; it's OK to fail).
+ */
+#define WT_CELL_LEN_CHK(p, len) do { \
+ if (end != NULL && (((uint8_t *)p) + (len)) > end) \
+ return (WT_ERROR); \
+} while (0)
+
+restart:
+ /*
+ * This code is performance critical for scans through read-only trees.
+ * Avoid WT_CLEAR here: it makes this code run significantly slower.
+ */
+ WT_CLEAR_INLINE(WT_CELL_UNPACK, *unpack);
+ WT_CELL_LEN_CHK(cell, 0);
+ unpack->cell = cell;
+ unpack->type = __wt_cell_type(cell);
+ unpack->raw = __wt_cell_type_raw(cell);
+
+ /*
+	 * Handle cells with neither an RLE count nor a data length: short
+ * cells have 6 bits of data length in the descriptor byte.
+ */
+ switch (unpack->raw) {
+ case WT_CELL_KEY_SHORT_PFX:
+ WT_CELL_LEN_CHK(cell, 1); /* skip prefix */
+ unpack->prefix = cell->__chunk[1];
+
+ unpack->data = cell->__chunk + 2;
+ unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
+ unpack->__len = 2 + unpack->size;
+ goto done;
+ case WT_CELL_KEY_SHORT:
+ case WT_CELL_VALUE_SHORT:
+ unpack->data = cell->__chunk + 1;
+ unpack->size = cell->__chunk[0] >> WT_CELL_SHORT_SHIFT;
+ unpack->__len = 1 + unpack->size;
+ goto done;
+ }
+
+ p = (uint8_t *)cell + 1; /* skip cell */
+
+ /*
+ * Check for a prefix byte that optionally follows the cell descriptor
+ * byte on row-store leaf pages.
+ */
+ if (unpack->raw == WT_CELL_KEY_PFX) {
+ ++p; /* skip prefix */
+ WT_CELL_LEN_CHK(p, 0);
+ unpack->prefix = cell->__chunk[1];
+ }
+
+ /*
+ * Check for an RLE count or record number that optionally follows the
+ * cell descriptor byte on column-store variable-length pages.
+ */
+ if (cell->__chunk[0] & WT_CELL_64V) /* skip value */
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v));
+
+ /*
+ * Handle special actions for a few different cell types and set the
+ * data length (deleted cells are fixed-size without length bytes,
+ * almost everything else has data length bytes).
+ */
+ switch (unpack->raw) {
+ case WT_CELL_VALUE_COPY:
+ /*
+ * The cell is followed by an offset to a cell written earlier
+ * in the page. Save/restore the length and RLE of this cell,
+ * we need the length to step through the set of cells on the
+ * page and this RLE is probably different from the RLE of the
+ * earlier cell.
+ */
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : (size_t)(end - p), &v));
+ saved_len = WT_PTRDIFF32(p, cell);
+ saved_v = unpack->v;
+ cell = (WT_CELL *)((uint8_t *)cell - v);
+ copied = 1;
+ goto restart;
+
+ case WT_CELL_KEY_OVFL:
+ case WT_CELL_KEY_OVFL_RM:
+ case WT_CELL_VALUE_OVFL:
+ case WT_CELL_VALUE_OVFL_RM:
+ /*
+ * Set overflow flag.
+ */
+ unpack->ovfl = 1;
+ /* FALLTHROUGH */
+
+ case WT_CELL_ADDR_DEL:
+ case WT_CELL_ADDR_INT:
+ case WT_CELL_ADDR_LEAF:
+ case WT_CELL_ADDR_LEAF_NO:
+ case WT_CELL_KEY:
+ case WT_CELL_KEY_PFX:
+ case WT_CELL_VALUE:
+ /*
+ * The cell is followed by a 4B data length and a chunk of
+ * data.
+ */
+ WT_RET(__wt_vunpack_uint(
+ &p, end == NULL ? 0 : (size_t)(end - p), &v));
+
+ if (unpack->raw == WT_CELL_KEY ||
+ unpack->raw == WT_CELL_KEY_PFX ||
+ (unpack->raw == WT_CELL_VALUE && unpack->v == 0))
+ v += WT_CELL_SIZE_ADJUST;
+
+ unpack->data = p;
+ unpack->size = (uint32_t)v;
+ unpack->__len = WT_PTRDIFF32(p + unpack->size, cell);
+ break;
+
+ case WT_CELL_DEL:
+ unpack->__len = WT_PTRDIFF32(p, cell);
+ break;
+ default:
+ return (WT_ERROR); /* Unknown cell type. */
+ }
+
+ /*
+ * Check the original cell against the full cell length (this is a
+	 * diagnostic as well: we may be copying the cell from the page and
+	 * need the right length).
+ */
+done: WT_CELL_LEN_CHK(cell, unpack->__len);
+ if (copied) {
+ unpack->raw = WT_CELL_VALUE_COPY;
+ unpack->__len = saved_len;
+ unpack->v = saved_v;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_cell_unpack --
+ * Unpack a WT_CELL into a structure.
+ */
+static inline void
+__wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
+{
+ (void)__wt_cell_unpack_safe(cell, unpack, NULL);
+}
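+
+/*
+ * __cell_count_values_example --
+ *	An illustrative sketch of WT_CELL_FOREACH usage; the function is
+ * hypothetical and not part of the original change. The loop body must
+ * unpack each cell, because the macro steps to the next cell using the
+ * unpacked cell's __len field.
+ */
+static inline uint32_t
+__cell_count_values_example(WT_BTREE *btree, WT_PAGE_HEADER *dsk)
+{
+	WT_CELL *cell;
+	WT_CELL_UNPACK unpack;
+	uint32_t i, nvalues;
+
+	nvalues = 0;
+	WT_CELL_FOREACH(btree, dsk, cell, &unpack, i) {
+		__wt_cell_unpack(cell, &unpack);
+		if (unpack.type == WT_CELL_VALUE)
+			++nvalues;
+	}
+	return (nvalues);
+}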
+
+/*
+ * __cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ */
+static inline int
+__cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_BTREE *btree;
+ void *huffman;
+
+ btree = S2BT(session);
+
+ /* Reference the cell's data, optionally decode it. */
+ switch (unpack->type) {
+ case WT_CELL_KEY:
+ store->data = unpack->data;
+ store->size = unpack->size;
+ if (page_type == WT_PAGE_ROW_INT)
+ return (0);
+
+ huffman = btree->huffman_key;
+ break;
+ case WT_CELL_VALUE:
+ store->data = unpack->data;
+ store->size = unpack->size;
+ huffman = btree->huffman_value;
+ break;
+ case WT_CELL_KEY_OVFL:
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
+ if (page_type == WT_PAGE_ROW_INT)
+ return (0);
+
+ huffman = btree->huffman_key;
+ break;
+ case WT_CELL_VALUE_OVFL:
+ WT_RET(__wt_ovfl_read(session, page, unpack, store));
+ huffman = btree->huffman_value;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (huffman == NULL ? 0 :
+ __wt_huffman_decode(
+ session, huffman, store->data, store->size, store));
+}
+
+/*
+ * __wt_dsk_cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ *
+ * There are two versions because of WT_CELL_VALUE_OVFL_RM type cells. When an
+ * overflow item is deleted, its backing blocks are removed; if there are still
+ * running transactions that might need to see the overflow item, we cache a
+ * copy of the item and reset the item's cell to WT_CELL_VALUE_OVFL_RM. If we
+ * find a WT_CELL_VALUE_OVFL_RM cell when reading an overflow item, we use the
+ * page reference to look aside into the cache. So, calling the "dsk" version
+ * of the function declares the cell cannot be of type WT_CELL_VALUE_OVFL_RM,
+ * and calling the "page" version means it might be.
+ */
+static inline int
+__wt_dsk_cell_data_ref(WT_SESSION_IMPL *session,
+ int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ WT_ASSERT(session,
+ __wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM);
+ return (__cell_data_ref(session, NULL, page_type, unpack, store));
+}
+
+/*
+ * __wt_page_cell_data_ref --
+ * Set a buffer to reference the data from an unpacked cell.
+ */
+static inline int
+__wt_page_cell_data_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ return (__cell_data_ref(session, page, page->type, unpack, store));
+}
+
+/*
+ * __wt_cell_data_copy --
+ * Copy the data from an unpacked cell into a buffer.
+ */
+static inline int
+__wt_cell_data_copy(WT_SESSION_IMPL *session,
+ int page_type, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+{
+ /*
+ * We have routines to both copy and reference a cell's information. In
+ * most cases, all we need is a reference and we prefer that, especially
+	 * when returning key/value items. In a few cases we need a real copy:
+	 * call the standard reference function and get a reference. Sometimes
+	 * a copy will be made (for example, when reading an overflow item from
+	 * the underlying object). If that happens, we're done; otherwise, make
+ * a copy.
+ *
+ * We don't require two versions of this function, no callers need to
+ * handle WT_CELL_VALUE_OVFL_RM cells.
+ */
+ WT_RET(__wt_dsk_cell_data_ref(session, page_type, unpack, store));
+ if (!WT_DATA_IN_ITEM(store))
+ WT_RET(__wt_buf_set(session, store, store->data, store->size));
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i
new file mode 100644
index 00000000000..42c3664323d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/column.i
@@ -0,0 +1,201 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __col_insert_search_match --
+ *	Search a column-store insert list for an exact match.
+ */
+static inline WT_INSERT *
+__col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno)
+{
+ WT_INSERT **insp, *ret_ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /* If there's no insert chain to search, we're done. */
+ if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (NULL);
+
+ /* Fast path the check for values at the end of the skiplist. */
+ if (recno > WT_INSERT_RECNO(ret_ins))
+ return (NULL);
+ else if (recno == WT_INSERT_RECNO(ret_ins))
+ return (ret_ins);
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) {
+ if (*insp == NULL) {
+ --i;
+ --insp;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(*insp);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp == 0) /* Exact match: return */
+ return (*insp);
+ else if (cmp > 0) /* Keep going at this level */
+ insp = &(*insp)->next[i];
+ else { /* Drop down a level */
+ --i;
+ --insp;
+ }
+ }
+
+ return (NULL);
+}
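+
+/*
+ * A worked example of the descent above, with illustrative records not
+ * taken from this change: searching for recno 25 in a list holding 10, 20
+ * and 30 walks right at each level while 25 is greater than the next
+ * entry's recno (past 10 and 20), drops a level whenever the next entry is
+ * NULL or holds 30, and finally returns NULL from level 0 because there is
+ * no exact match.
+ */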
+
+/*
+ * __col_insert_search --
+ * Search a column-store insert list, creating a skiplist stack as we go.
+ */
+static inline WT_INSERT *
+__col_insert_search(WT_INSERT_HEAD *inshead,
+ WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno)
+{
+ WT_INSERT **insp, *ret_ins;
+ uint64_t ins_recno;
+ int cmp, i;
+
+ /* If there's no insert chain to search, we're done. */
+ if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL)
+ return (NULL);
+
+ /* Fast path appends. */
+ if (recno >= WT_INSERT_RECNO(ret_ins)) {
+ for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
+ ins_stack[i] = (i == 0) ? &ret_ins->next[0] :
+ (inshead->tail[i] != NULL) ?
+ &inshead->tail[i]->next[i] : &inshead->head[i];
+ next_stack[i] = NULL;
+ }
+ return (ret_ins);
+ }
+
+ /*
+ * The insert list is a skip list: start at the highest skip level, then
+ * go as far as possible at each level before stepping down to the next.
+ */
+ for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) {
+ if ((ret_ins = *insp) == NULL) {
+ next_stack[i] = NULL;
+ ins_stack[i--] = insp--;
+ continue;
+ }
+
+ ins_recno = WT_INSERT_RECNO(ret_ins);
+ cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1;
+
+ if (cmp > 0) /* Keep going at this level */
+ insp = &ret_ins->next[i];
+ else if (cmp == 0) /* Exact match: return */
+ for (; i >= 0; i--) {
+ next_stack[i] = ret_ins->next[i];
+ ins_stack[i] = &ret_ins->next[i];
+ }
+ else { /* Drop down a level */
+ next_stack[i] = ret_ins;
+ ins_stack[i--] = insp--;
+ }
+ }
+ return (ret_ins);
+}
+
+/*
+ * __col_var_last_recno --
+ * Return the last record number for a variable-length column-store page.
+ */
+static inline uint64_t
+__col_var_last_recno(WT_PAGE *page)
+{
+ WT_COL_RLE *repeat;
+
+ /*
+ * If there's an append list (the last page), then there may be more
+ * records on the page. This function ignores those records, so our
+ * callers have to handle that explicitly, if they care.
+ */
+ if (page->pg_var_nrepeats == 0)
+ return (page->pg_var_entries == 0 ? 0 :
+ page->pg_var_recno + (page->pg_var_entries - 1));
+
+ repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1];
+ return ((repeat->recno + repeat->rle) - 1 +
+ (page->pg_var_entries - (repeat->indx + 1)));
+}
+
+/*
+ * __col_fix_last_recno --
+ * Return the last record number for a fixed-length column-store page.
+ */
+static inline uint64_t
+__col_fix_last_recno(WT_PAGE *page)
+{
+ /*
+ * If there's an append list (the last page), then there may be more
+ * records on the page. This function ignores those records, so our
+ * callers have to handle that explicitly, if they care.
+ */
+ return (page->pg_fix_entries == 0 ? 0 :
+ page->pg_fix_recno + (page->pg_fix_entries - 1));
+}
+
+/*
+ * __col_var_search --
+ * Search a variable-length column-store page for a record.
+ */
+static inline WT_COL *
+__col_var_search(WT_PAGE *page, uint64_t recno)
+{
+ WT_COL_RLE *repeat;
+ uint64_t start_recno;
+ uint32_t base, indx, limit, start_indx;
+
+ /*
+ * Find the matching slot.
+ *
+ * This is done in two stages: first, we do a binary search among any
+	 * repeating records to find the largest repeat less than the search
+	 * key.
+ * Once there, we can do a simple offset calculation to find the correct
+ * slot for this record number, because we know any intervening records
+ * have repeat counts of 1.
+ */
+ for (base = 0, limit = page->pg_var_nrepeats; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+
+ repeat = page->pg_var_repeats + indx;
+ if (recno >= repeat->recno &&
+ recno < repeat->recno + repeat->rle)
+ return (page->pg_var_d + repeat->indx);
+ if (recno < repeat->recno)
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+
+ /*
+	 * We didn't find an exact match; move forward from the largest repeat
+ * less than the search key.
+ */
+ if (base == 0) {
+ start_indx = 0;
+ start_recno = page->pg_var_recno;
+ } else {
+ repeat = page->pg_var_repeats + (base - 1);
+ start_indx = repeat->indx + 1;
+ start_recno = repeat->recno + repeat->rle;
+ }
+
+ if (recno >= start_recno + (page->pg_var_entries - start_indx))
+ return (NULL);
+
+ return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno));
+}
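+
+/*
+ * A worked example of the search above, with illustrative numbers not
+ * taken from this change: on a page whose first recno is 100, with a
+ * single repeat entry {recno 110, rle 5, indx 10}, searching for recno 112
+ * lands inside the repeat and returns slot 10; searching for recno 117
+ * falls past it, so start_indx == 11, start_recno == 115, and the function
+ * returns slot 11 + (117 - 115) == 13.
+ */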
diff --git a/src/third_party/wiredtiger/src/include/compact.h b/src/third_party/wiredtiger/src/include/compact.h
new file mode 100644
index 00000000000..aa34eab4d24
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/compact.h
@@ -0,0 +1,12 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_compact {
+ uint32_t lsm_count; /* Number of LSM trees seen */
+ uint32_t file_count; /* Number of files seen */
+ uint64_t max_time; /* Configured timeout */
+};
diff --git a/src/third_party/wiredtiger/src/include/config.h b/src/third_party/wiredtiger/src/include/config.h
new file mode 100644
index 00000000000..b9c4c97fa00
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/config.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_config {
+ WT_SESSION_IMPL *session;
+ const char *orig;
+ const char *end;
+ const char *cur;
+
+ int depth, top;
+ const int8_t *go;
+};
+
+struct __wt_config_check {
+ const char *name;
+ const char *type;
+ const char *checks;
+ const WT_CONFIG_CHECK *subconfigs;
+};
+
+#define WT_CONFIG_REF(session, n) \
+ (S2C(session)->config_entries[WT_CONFIG_ENTRY_##n])
+struct __wt_config_entry {
+ const char *method; /* method name */
+
+#define WT_CONFIG_BASE(session, n) (WT_CONFIG_REF(session, n)->base)
+ const char *base; /* configuration base */
+
+ const WT_CONFIG_CHECK *checks; /* check array */
+};
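+
+/*
+ * Illustrative usage, with a hypothetical caller not part of the original
+ * change: WT_CONFIG_BASE(session, session_create) supplies the default
+ * configuration string for WT_SESSION::create, and the matching entry's
+ * checks array drives validation of application-supplied overrides.
+ */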
+
+struct __wt_config_parser_impl {
+ WT_CONFIG_PARSER iface;
+
+ WT_SESSION_IMPL *session;
+ WT_CONFIG config;
+ WT_CONFIG_ITEM config_item;
+};
+
+/*
+ * DO NOT EDIT: automatically built by dist/api_config.py.
+ * configuration section: BEGIN
+ */
+#define WT_CONFIG_ENTRY_colgroup_meta 0
+#define WT_CONFIG_ENTRY_connection_add_collator 1
+#define WT_CONFIG_ENTRY_connection_add_compressor 2
+#define WT_CONFIG_ENTRY_connection_add_data_source 3
+#define WT_CONFIG_ENTRY_connection_add_extractor 4
+#define WT_CONFIG_ENTRY_connection_async_new_op 5
+#define WT_CONFIG_ENTRY_connection_close 6
+#define WT_CONFIG_ENTRY_connection_load_extension 7
+#define WT_CONFIG_ENTRY_connection_open_session 8
+#define WT_CONFIG_ENTRY_connection_reconfigure 9
+#define WT_CONFIG_ENTRY_cursor_close 10
+#define WT_CONFIG_ENTRY_file_meta 11
+#define WT_CONFIG_ENTRY_index_meta 12
+#define WT_CONFIG_ENTRY_session_begin_transaction 13
+#define WT_CONFIG_ENTRY_session_checkpoint 14
+#define WT_CONFIG_ENTRY_session_close 15
+#define WT_CONFIG_ENTRY_session_commit_transaction 16
+#define WT_CONFIG_ENTRY_session_compact 17
+#define WT_CONFIG_ENTRY_session_create 18
+#define WT_CONFIG_ENTRY_session_drop 19
+#define WT_CONFIG_ENTRY_session_log_printf 20
+#define WT_CONFIG_ENTRY_session_open_cursor 21
+#define WT_CONFIG_ENTRY_session_reconfigure 22
+#define WT_CONFIG_ENTRY_session_rename 23
+#define WT_CONFIG_ENTRY_session_rollback_transaction 24
+#define WT_CONFIG_ENTRY_session_salvage 25
+#define WT_CONFIG_ENTRY_session_truncate 26
+#define WT_CONFIG_ENTRY_session_upgrade 27
+#define WT_CONFIG_ENTRY_session_verify 28
+#define WT_CONFIG_ENTRY_table_meta 29
+#define WT_CONFIG_ENTRY_wiredtiger_open 30
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 31
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 32
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 33
+/*
+ * configuration section: END
+ * DO NOT EDIT: automatically built by dist/api_config.py.
+ */
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
new file mode 100644
index 00000000000..81866e39df9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -0,0 +1,270 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*******************************************
+ * Global per-process structure.
+ *******************************************/
+/*
+ * WT_PROCESS --
+ * Per-process information for the library.
+ */
+struct __wt_process {
+ WT_SPINLOCK spinlock; /* Per-process spinlock */
+
+ /* Locked: connection queue */
+ TAILQ_HEAD(__wt_connection_impl_qh, __wt_connection_impl) connqh;
+ WT_CACHE_POOL *cache_pool;
+};
+extern WT_PROCESS __wt_process;
+
+/*
+ * WT_NAMED_COLLATOR --
+ * A collator list entry
+ */
+struct __wt_named_collator {
+ const char *name; /* Name of collator */
+ WT_COLLATOR *collator; /* User supplied object */
+ TAILQ_ENTRY(__wt_named_collator) q; /* Linked list of collators */
+};
+
+/*
+ * WT_NAMED_COMPRESSOR --
+ * A compressor list entry
+ */
+struct __wt_named_compressor {
+ const char *name; /* Name of compressor */
+ WT_COMPRESSOR *compressor; /* User supplied callbacks */
+ /* Linked list of compressors */
+ TAILQ_ENTRY(__wt_named_compressor) q;
+};
+
+/*
+ * WT_NAMED_DATA_SOURCE --
+ * A data source list entry
+ */
+struct __wt_named_data_source {
+ const char *prefix; /* Name of data source */
+ WT_DATA_SOURCE *dsrc; /* User supplied callbacks */
+ /* Linked list of data sources */
+ TAILQ_ENTRY(__wt_named_data_source) q;
+};
+
+/*
+ * Allocate some additional slots for internal sessions. There is a default
+ * session for each connection, plus a session for each server thread.
+ */
+#define WT_NUM_INTERNAL_SESSIONS 10
+
+/*
+ * WT_CONNECTION_IMPL --
+ * Implementation of WT_CONNECTION
+ */
+struct __wt_connection_impl {
+ WT_CONNECTION iface;
+
+ /* For operations without an application-supplied session */
+ WT_SESSION_IMPL *default_session;
+ WT_SESSION_IMPL dummy_session;
+
+ const char *cfg; /* Connection configuration */
+
+ WT_SPINLOCK api_lock; /* Connection API spinlock */
+ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */
+ WT_SPINLOCK fh_lock; /* File handle queue spinlock */
+ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */
+ WT_SPINLOCK schema_lock; /* Schema operation spinlock */
+
+ /*
+ * We distribute the btree page locks across a set of spin locks; it
+	 * can't be an array: we impose cache-line alignment and gcc doesn't
+ * support that for arrays. Don't use too many: they are only held for
+ * very short operations, each one is 64 bytes, so 256 will fill the L1
+ * cache on most CPUs.
+ */
+#define WT_PAGE_LOCKS(conn) 16
+ WT_SPINLOCK *page_lock; /* Btree page spinlocks */
+ u_int page_lock_cnt; /* Next spinlock to use */
+
+ /* Connection queue */
+ TAILQ_ENTRY(__wt_connection_impl) q;
+ /* Cache pool queue */
+ TAILQ_ENTRY(__wt_connection_impl) cpq;
+
+ const char *home; /* Database home */
+ const char *error_prefix; /* Database error prefix */
+ int is_new; /* Connection created database */
+
+ WT_EXTENSION_API extension_api; /* Extension API */
+
+ /* Configuration */
+ const WT_CONFIG_ENTRY **config_entries;
+
+ void **foc; /* Free-on-close array */
+ size_t foc_cnt; /* Array entries */
+ size_t foc_size; /* Array size */
+
+ WT_FH *lock_fh; /* Lock file handle */
+
+ uint64_t split_gen; /* Generation number for splits */
+
+ WT_SPINLOCK dhandle_lock; /* Locked: dhandle sweep */
+ /* Locked: data handle list */
+ SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh;
+ /* Locked: LSM handle list. */
+ TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh;
+ /* Locked: file list */
+ TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh;
+ /* Locked: library list */
+ TAILQ_HEAD(__wt_dlh_qh, __wt_dlh) dlhqh;
+
+ WT_SPINLOCK block_lock; /* Locked: block manager list */
+ TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh;
+
+ u_int open_btree_count; /* Locked: open writable btree count */
+ uint32_t next_file_id; /* Locked: file ID counter */
+
+ /*
+ * WiredTiger allocates space for 50 simultaneous sessions (threads of
+ * control) by default. Growing the number of threads dynamically is
+ * possible, but tricky since server threads are walking the array
+ * without locking it.
+ *
+ * There's an array of WT_SESSION_IMPL pointers that reference the
+ * allocated array; we do it that way because we want an easy way for
+ * the server thread code to avoid walking the entire array when only a
+ * few threads are running.
+ */
+ WT_SESSION_IMPL *sessions; /* Session reference */
+ uint32_t session_size; /* Session array size */
+ uint32_t session_cnt; /* Session count */
+
+ /*
+ * WiredTiger allocates space for a fixed number of hazard pointers
+ * in each thread of control.
+ */
+ uint32_t hazard_max; /* Hazard array size */
+
+ WT_CACHE *cache; /* Page cache */
+ uint64_t cache_size;
+
+ WT_TXN_GLOBAL txn_global; /* Global transaction state */
+
+ WT_SPINLOCK hot_backup_lock; /* Hot backup serialization */
+ int hot_backup;
+
+ WT_SESSION_IMPL *ckpt_session; /* Checkpoint thread session */
+ wt_thread_t ckpt_tid; /* Checkpoint thread */
+ int ckpt_tid_set; /* Checkpoint thread set */
+ WT_CONDVAR *ckpt_cond; /* Checkpoint wait mutex */
+ const char *ckpt_config; /* Checkpoint configuration */
+#define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0)
+ wt_off_t ckpt_logsize; /* Checkpoint log size period */
+ uint32_t ckpt_signalled; /* Checkpoint signalled */
+ long ckpt_usecs; /* Checkpoint period */
+
+ int compact_in_memory_pass; /* Compaction serialization */
+
+#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */
+#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */
+#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */
+#define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */
+#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */
+ uint32_t stat_flags;
+
+ WT_CONNECTION_STATS stats; /* Connection statistics */
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ /*
+ * Spinlock registration, so we can track which spinlocks are heavily
+ * used, which are blocking and where.
+ *
+ * There's an array of spinlocks, and an array of blocking IDs.
+ */
+#define WT_SPINLOCK_MAX 1024
+#define WT_SPINLOCK_MAX_LOCATION_ID 60
+ WT_SPINLOCK *spinlock_list[WT_SPINLOCK_MAX];
+
+ /* Spinlock blocking matrix */
+ struct __wt_connection_stats_spinlock {
+ const char *name; /* Mutex name */
+
+ const char *file; /* Caller's file/line, ID location */
+ int line;
+
+ u_int total; /* Count of total, blocked calls */
+ u_int blocked[WT_SPINLOCK_MAX_LOCATION_ID];
+ } spinlock_block[WT_SPINLOCK_MAX_LOCATION_ID];
+#endif
+
+ WT_ASYNC *async; /* Async structure */
+ int async_cfg; /* Global async configuration */
+ uint32_t async_size; /* Async op array size */
+ uint32_t async_workers; /* Number of async workers */
+
+ WT_LSM_MANAGER lsm_manager; /* LSM worker thread information */
+
+ WT_SESSION_IMPL *evict_session; /* Eviction server sessions */
+ wt_thread_t evict_tid; /* Eviction server thread ID */
+ int evict_tid_set; /* Eviction server thread ID set */
+
+ uint32_t evict_workers_max;/* Max eviction workers */
+ uint32_t evict_workers_min;/* Min eviction workers */
+ uint32_t evict_workers; /* Number of eviction workers */
+ WT_EVICT_WORKER *evict_workctx; /* Eviction worker context */
+
+ WT_SESSION_IMPL *stat_session; /* Statistics log session */
+ wt_thread_t stat_tid; /* Statistics log thread */
+ int stat_tid_set; /* Statistics log thread set */
+ WT_CONDVAR *stat_cond; /* Statistics log wait mutex */
+ const char *stat_format; /* Statistics log timestamp format */
+ FILE *stat_fp; /* Statistics log file handle */
+ char *stat_path; /* Statistics log path format */
+ char **stat_sources; /* Statistics log list of objects */
+ const char *stat_stamp; /* Statistics log entry timestamp */
+ long stat_usecs; /* Statistics log period */
+
+ int logging; /* Global logging configuration */
+ int archive; /* Global archive configuration */
+ WT_CONDVAR *arch_cond; /* Log archive wait mutex */
+ WT_SESSION_IMPL *arch_session; /* Log archive session */
+ wt_thread_t arch_tid; /* Log archive thread */
+ int arch_tid_set; /* Log archive thread set */
+ WT_LOG *log; /* Logging structure */
+ wt_off_t log_file_max; /* Log file max size */
+ const char *log_path; /* Logging path format */
+ uint32_t txn_logsync; /* Log sync configuration */
+
+ WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
+ wt_thread_t sweep_tid; /* Handle sweep thread */
+ int sweep_tid_set; /* Handle sweep thread set */
+ WT_CONDVAR *sweep_cond; /* Handle sweep wait mutex */
+
+ /* Locked: collator list */
+ TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh;
+
+ /* Locked: compressor list */
+ TAILQ_HEAD(__wt_comp_qh, __wt_named_compressor) compqh;
+
+ /* Locked: data source list */
+ TAILQ_HEAD(__wt_dsrc_qh, __wt_named_data_source) dsrcqh;
+
+ void *lang_private; /* Language specific private storage */
+
+ /* If non-zero, all buffers used for I/O will be aligned to this. */
+ size_t buffer_alignment;
+
+ uint32_t schema_gen; /* Schema generation number */
+
+ wt_off_t data_extend_len; /* file_extend data length */
+ wt_off_t log_extend_len; /* file_extend log length */
+
+ uint32_t direct_io; /* O_DIRECT file type flags */
+ int mmap; /* mmap configuration */
+ uint32_t verbose;
+
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h
new file mode 100644
index 00000000000..17185499b88
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cursor.h
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Initialize a static WT_CURSOR structure.
+ */
+#define WT_CURSOR_STATIC_INIT(n, \
+ get_key, \
+ get_value, \
+ set_key, \
+ set_value, \
+ compare, \
+ next, \
+ prev, \
+ reset, \
+ search, \
+ search_near, \
+ insert, \
+ update, \
+ remove, \
+ close) \
+ static const WT_CURSOR n = { \
+ NULL, /* session */ \
+ NULL, /* uri */ \
+ NULL, /* key_format */ \
+ NULL, /* value_format */ \
+ (int (*)(WT_CURSOR *, ...))(get_key), \
+ (int (*)(WT_CURSOR *, ...))(get_value), \
+ (void (*)(WT_CURSOR *, ...))(set_key), \
+ (void (*)(WT_CURSOR *, ...))(set_value), \
+ (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(compare), \
+ next, \
+ prev, \
+ reset, \
+ search, \
+ (int (*)(WT_CURSOR *, int *))(search_near), \
+ insert, \
+ update, \
+ remove, \
+ close, \
+ { NULL, NULL }, /* TAILQ_ENTRY q */ \
+ 0, /* recno key */ \
+ { 0 }, /* recno raw buffer */ \
+ NULL, /* json_private */ \
+ NULL, /* lang_private */ \
+ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \
+ { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \
+ 0, /* int saved_err */ \
+ NULL, /* internal_uri */ \
+ 0 /* uint32_t flags */ \
+}
+
+struct __wt_cursor_backup_entry {
+ char *name; /* File name */
+ WT_DATA_HANDLE *handle; /* Handle */
+};
+struct __wt_cursor_backup {
+ WT_CURSOR iface;
+
+ size_t next; /* Cursor position */
+ FILE *bfp; /* Backup file */
+
+ WT_CURSOR_BACKUP_ENTRY *list; /* List of files to be copied. */
+ size_t list_allocated;
+ size_t list_next;
+};
+
+struct __wt_cursor_btree {
+ WT_CURSOR iface;
+
+ WT_BTREE *btree; /* Enclosing btree */
+
+ /*
+ * The following fields are set by the search functions as a precursor
+ * to page modification: we have a page, a WT_COL/WT_ROW slot on the
+ * page, an insert head, insert list and a skiplist stack (the stack of
+ * skiplist entries leading to the insert point). The search functions
+ * also return the relationship of the search key to the found key.
+ */
+ WT_REF *ref; /* Current page */
+ uint32_t slot; /* WT_COL/WT_ROW 0-based slot */
+
+ WT_INSERT_HEAD *ins_head; /* Insert chain head */
+ WT_INSERT *ins; /* Current insert node */
+ /* Search stack */
+ WT_INSERT **ins_stack[WT_SKIP_MAXDEPTH];
+
+ /* Next item(s) found during search */
+ WT_INSERT *next_stack[WT_SKIP_MAXDEPTH];
+
+ uint64_t recno; /* Record number */
+
+ /*
+ * The search function sets compare to:
+	 *	< 0 if the found key is less than the specified key
+	 *	  0 if the found key matches the specified key
+	 *	> 0 if the found key is larger than the specified key
+ */
+ int compare;
+
+ /*
+	 * The key value from a binary search of a row-store file; we keep a
+	 * copy of the last key we retrieved in the search, avoiding the
+	 * additional work of getting the key again to return it to the
+	 * application.
+ */
+ WT_ITEM search_key;
+
+ /*
+ * It's relatively expensive to calculate the last record on a variable-
+ * length column-store page because of the repeat values. Calculate it
+ * once per page and cache it. This value doesn't include the skiplist
+ * of appended entries on the last page.
+ */
+ uint64_t last_standard_recno;
+
+ /*
+ * For row-store pages, we need a single item that tells us the part of
+ * the page we're walking (otherwise switching from next to prev and
+ * vice-versa is just too complicated), so we map the WT_ROW and
+ * WT_INSERT_HEAD insert array slots into a single name space: slot 1
+ * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
+	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
+	 * odd-numbered slots, and WT_ROW array slots are even-numbered
+	 * slots (see the mapping sketch following this structure).
+ */
+ uint32_t row_iteration_slot; /* Row-store iteration slot */
+
+ /*
+ * Variable-length column-store values are run-length encoded and may
+ * be overflow values or Huffman encoded. To avoid repeatedly reading
+ * overflow values or decompressing encoded values, process it once and
+ * store the result in a temporary buffer. The cip_saved field is used
+ * to determine if we've switched columns since our last cursor call.
+ */
+ WT_COL *cip_saved; /* Last iteration reference */
+
+ /*
+ * We don't instantiate prefix-compressed keys on pages where there's no
+ * Huffman encoding because we don't want to waste memory if only moving
+ * a cursor through the page, and it's faster to build keys while moving
+ * through the page than to roll-forward from a previously instantiated
+ * key (we don't instantiate all of the keys, just the ones at binary
+ * search points). We can't use the application's WT_CURSOR key field
+ * as a copy of the last-returned key because it may have been altered
+ * by the API layer, for example, dump cursors. Instead we store the
+ * last-returned key in a temporary buffer. The rip_saved field is used
+ * to determine if the key in the temporary buffer has the prefix needed
+ * for building the current key.
+ */
+ WT_ROW *rip_saved; /* Last-returned key reference */
+
+ /*
+ * A temporary buffer for caching RLE values for column-store files.
+ */
+ WT_ITEM tmp;
+
+ /*
+ * The update structure allocated by the row- and column-store modify
+ * functions, used to avoid a data copy in the WT_CURSOR.update call.
+ */
+ WT_UPDATE *modify_update;
+
+ /*
+ * Fixed-length column-store items are a single byte, and it's simpler
+ * and cheaper to allocate the space for it now than keep checking to
+ * see if we need to grow the buffer.
+ */
+ uint8_t v; /* Fixed-length return value */
+
+ uint8_t append_tree; /* Cursor appended to the tree */
+
+#define WT_CBT_ACTIVE 0x01 /* Active in the tree */
+#define WT_CBT_ITERATE_APPEND 0x02 /* Col-store: iterating append list */
+#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
+#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
+#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
+#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
+ uint8_t flags;
+};
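
To make the slot numbering concrete, the mapping can be inverted with two
small helpers. These are illustrative only (they don't exist in this tree);
slot 1, the smallest-key insert list, is the special case handled before
either applies:

    /* Even slots name WT_ROW entries: slot 2 -> WT_ROW[0]. */
    static inline uint32_t
    __iter_slot_to_row(uint32_t slot)
    {
        return (slot / 2 - 1);
    }

    /* Odd slots >= 3 name insert lists: slot 3 -> WT_INSERT_HEAD[0]. */
    static inline uint32_t
    __iter_slot_to_ins_head(uint32_t slot)
    {
        return ((slot - 3) / 2);
    }
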
+
+struct __wt_cursor_bulk {
+ WT_CURSOR_BTREE cbt;
+
+ WT_REF *ref; /* The leaf page */
+ WT_PAGE *leaf;
+
+ /*
+	 * Variable-length column stores compare values during bulk load as
+	 * part of RLE compression; row stores compare keys during bulk load
+	 * to avoid corruption.
+ */
+ WT_ITEM last; /* Last key/value seen */
+
+ /*
+	 * Variable-length column-store RLE counter (also overloaded to mean
+	 * the first time through the bulk-load insert routine, when set to
+	 * 0; see the sketch following this structure).
+ */
+ uint64_t rle;
+
+ /*
+ * Fixed-length column-store current entry in memory chunk count, and
+ * the maximum number of records per chunk.
+ */
+ uint32_t entry; /* Entry count */
+ uint32_t nrecs; /* Max records per chunk */
+
+ /* Special bitmap bulk load for fixed-length column stores. */
+ int bitmap;
+
+ void *reconcile; /* Reconciliation information */
+};
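
For illustration, the rle counter drives a compare-against-last pattern in
the variable-length bulk-load path: a value equal to the previous one
extends the current run, anything else flushes the run and starts a new
one. A simplified sketch under those assumptions (the helper is
hypothetical and the actual page output is elided):

    static int
    __bulk_rle_insert(WT_CURSOR_BULK *cbulk, WT_ITEM *value)
    {
        if (cbulk->rle != 0 &&          /* not the first call */
            value->size == cbulk->last.size &&
            memcmp(value->data, cbulk->last.data, value->size) == 0) {
            ++cbulk->rle;               /* extend the current run */
            return (0);
        }

        /* ... write out the previous run, save value in last ... */
        cbulk->rle = 1;                 /* start a new run */
        return (0);
    }
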
+
+struct __wt_cursor_config {
+ WT_CURSOR iface;
+};
+
+struct __wt_cursor_data_source {
+ WT_CURSOR iface;
+
+ WT_COLLATOR *collator; /* Configured collator */
+ int collator_owned; /* Collator needs to be terminated */
+
+ WT_CURSOR *source; /* Application-owned cursor */
+};
+
+struct __wt_cursor_dump {
+ WT_CURSOR iface;
+
+ WT_CURSOR *child;
+};
+
+struct __wt_cursor_index {
+ WT_CURSOR iface;
+
+ WT_TABLE *table;
+ WT_INDEX *index;
+ const char *key_plan, *value_plan;
+
+ WT_CURSOR *child;
+ WT_CURSOR **cg_cursors;
+};
+
+struct __wt_cursor_json {
+ char *key_buf; /* JSON formatted string */
+ char *value_buf; /* JSON formatted string */
+ WT_CONFIG_ITEM key_names; /* Names of key columns */
+ WT_CONFIG_ITEM value_names; /* Names of value columns */
+};
+
+struct __wt_cursor_log {
+ WT_CURSOR iface;
+
+ WT_LSN *cur_lsn; /* LSN of current record */
+ WT_LSN *next_lsn; /* LSN of next record */
+ WT_ITEM *logrec; /* Copy of record for cursor */
+ WT_ITEM *opkey, *opvalue; /* Op key/value copy */
+ const uint8_t *stepp, *stepp_end; /* Pointer within record */
+ uint32_t step_count; /* Intra-record count */
+ uint32_t rectype; /* Record type */
+ uint64_t txnid; /* Record txnid */
+ uint32_t flags;
+};
+
+struct __wt_cursor_metadata {
+ WT_CURSOR iface;
+
+ WT_CURSOR *file_cursor; /* Queries of regular metadata */
+
+#define WT_MDC_POSITIONED 0x01
+#define WT_MDC_ONMETADATA 0x02
+ uint32_t flags;
+};
+
+struct __wt_cursor_stat {
+ WT_CURSOR iface;
+
+ int notpositioned; /* Cursor not positioned */
+
+ WT_STATS *stats; /* Stats owned by the cursor */
+ WT_STATS *stats_first; /* First stats reference */
+ int stats_base; /* Base statistics value */
+ int stats_count; /* Count of stats elements */
+
+ union { /* Copies of the statistics */
+ WT_DSRC_STATS dsrc_stats;
+ WT_CONNECTION_STATS conn_stats;
+ } u;
+
+ int key; /* Current stats key */
+ uint64_t v; /* Current stats value */
+ WT_ITEM pv; /* Current stats value (string) */
+
+ /* Uses the same values as WT_CONNECTION::stat_flags field */
+ uint32_t flags;
+};
+
+/*
+ * WT_CURSOR_STATS --
+ *	Return a reference to a statistics cursor's stats structures; use the
+ * WT_CURSOR_STAT.stats_first field instead of WT_CURSOR_STAT.stats because
+ * the latter is NULL when non-cursor memory is used to hold the statistics.
+ */
+#define WT_CURSOR_STATS(cursor) \
+ (((WT_CURSOR_STAT *)cursor)->stats_first)
+
+struct __wt_cursor_table {
+ WT_CURSOR iface;
+
+ WT_TABLE *table;
+ const char *plan;
+
+ const char **cfg; /* Saved configuration string */
+
+ WT_CURSOR **cg_cursors;
+ WT_CURSOR **idx_cursors;
+};
+
+#define WT_CURSOR_PRIMARY(cursor) \
+ (((WT_CURSOR_TABLE *)cursor)->cg_cursors[0])
+
+#define WT_CURSOR_RECNO(cursor) WT_STREQ((cursor)->key_format, "r")
+
+/*
+ * WT_CURSOR_NEEDKEY, WT_CURSOR_NEEDVALUE --
+ *	Check if we have a key/value set.  There's an additional semantic
+ * implemented here: if we're pointing into the tree and about to perform
+ * a cursor operation, get a local copy of whatever we're referencing in
+ * the tree; there's an obvious race between the cursor moving and the key
+ * or value reference, and it's better to solve it here than in the
+ * underlying data-source layers.
+ *
+ * WT_CURSOR_CHECKKEY, WT_CURSOR_CHECKVALUE --
+ *	Check if a key/value is set without making a copy.
+ *
+ * WT_CURSOR_NOVALUE --
+ * Release any cached value before an operation that could update the
+ * transaction context and free data a value is pointing to.
+ */
+#define WT_CURSOR_CHECKKEY(cursor) do { \
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) \
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 1)); \
+} while (0)
+#define WT_CURSOR_CHECKVALUE(cursor) do { \
+ if (!F_ISSET(cursor, WT_CURSTD_VALUE_SET)) \
+ WT_ERR(__wt_cursor_kv_not_set(cursor, 0)); \
+} while (0)
+#define WT_CURSOR_NEEDKEY(cursor) do { \
+ if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { \
+ if (!WT_DATA_IN_ITEM(&(cursor)->key)) \
+ WT_ERR(__wt_buf_set( \
+ (WT_SESSION_IMPL *)(cursor)->session, \
+ &(cursor)->key, \
+ (cursor)->key.data, (cursor)->key.size)); \
+ F_CLR(cursor, WT_CURSTD_KEY_INT); \
+ F_SET(cursor, WT_CURSTD_KEY_EXT); \
+ } \
+ WT_CURSOR_CHECKKEY(cursor); \
+} while (0)
+#define WT_CURSOR_NEEDVALUE(cursor) do { \
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) { \
+ if (!WT_DATA_IN_ITEM(&(cursor)->value)) \
+ WT_ERR(__wt_buf_set( \
+ (WT_SESSION_IMPL *)(cursor)->session, \
+ &(cursor)->value, \
+ (cursor)->value.data, (cursor)->value.size));\
+ F_CLR(cursor, WT_CURSTD_VALUE_INT); \
+ F_SET(cursor, WT_CURSTD_VALUE_EXT); \
+ } \
+ WT_CURSOR_CHECKVALUE(cursor); \
+} while (0)
+#define WT_CURSOR_NOVALUE(cursor) do { \
+ F_CLR(cursor, WT_CURSTD_VALUE_INT); \
+} while (0)
+
+#define WT_CURSOR_RAW_OK \
+	(WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW)
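
A sketch of how the key/value macros are meant to be used inside a cursor
operation: the function is hypothetical, but the shape is fixed by the
macros themselves, which expand to WT_ERR and therefore require a local
"ret" (WT_DECL_RET) and an "err" label:

    static int
    __curext_update(WT_CURSOR *cursor)  /* hypothetical */
    {
        WT_DECL_RET;

        WT_CURSOR_NEEDKEY(cursor);      /* copy any tree key reference */
        WT_CURSOR_NEEDVALUE(cursor);    /* copy any tree value reference */

        /* ... perform the update ... */

    err:    return (ret);
    }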
diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i
new file mode 100644
index 00000000000..7f8e83643c5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/cursor.i
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __cursor_set_recno --
+ *	The record number in the interface has to track the value in the
+ * underlying cursor; update them in parallel.
+ */
+static inline void
+__cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v)
+{
+ cbt->iface.recno = cbt->recno = v;
+}
+
+/*
+ * __cursor_pos_clear --
+ * Reset the cursor's location.
+ */
+static inline void
+__cursor_pos_clear(WT_CURSOR_BTREE *cbt)
+{
+ /*
+	 * Most of the cursor's location information is set by every
+	 * successful operation; for example, we don't initialize the compare
+	 * return value because the row-store search always sets it.  The
+	 * rest is cleared here, and it's a minimal set of things we need to
+	 * clear.  It would be a lot simpler to clear everything, but we call
+	 * this function a lot.
+ */
+ cbt->recno = 0;
+
+ cbt->ins = NULL;
+ cbt->ins_head = NULL;
+ cbt->ins_stack[0] = NULL;
+
+ cbt->cip_saved = NULL;
+ cbt->rip_saved = NULL;
+
+ /*
+	 * Don't clear the active flag; it's owned by the cursor enter/leave
+	 * functions.
+ */
+ F_CLR(cbt, ~WT_CBT_ACTIVE);
+}
+
+/*
+ * __cursor_enter --
+ * Activate a cursor.
+ */
+static inline int
+__cursor_enter(WT_SESSION_IMPL *session)
+{
+ /*
+ * If there are no other cursors positioned in the session, check
+ * whether the cache is full.
+ */
+ if (session->ncursors == 0)
+ WT_RET(__wt_cache_full_check(session));
+ ++session->ncursors;
+ return (0);
+}
+
+/*
+ * __cursor_leave --
+ * Deactivate a cursor.
+ */
+static inline int
+__cursor_leave(WT_SESSION_IMPL *session)
+{
+ /*
+ * Decrement the count of active cursors in the session. When that
+ * goes to zero, there are no active cursors, and we can release any
+ * snapshot we're holding for read committed isolation.
+ */
+ WT_ASSERT(session, session->ncursors > 0);
+ if (--session->ncursors == 0)
+ __wt_txn_read_last(session);
+
+ return (0);
+}
+
+/*
+ * __curfile_enter --
+ * Activate a file cursor.
+ */
+static inline int
+__curfile_enter(WT_CURSOR_BTREE *cbt)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ WT_RET(__cursor_enter(session));
+ F_SET(cbt, WT_CBT_ACTIVE);
+ return (0);
+}
+
+/*
+ * __curfile_leave --
+ * Clear a file cursor's position.
+ */
+static inline int
+__curfile_leave(WT_CURSOR_BTREE *cbt)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ /* If the cursor was active, deactivate it. */
+ if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
+ WT_RET(__cursor_leave(session));
+ F_CLR(cbt, WT_CBT_ACTIVE);
+ }
+
+ /*
+ * Release any page references we're holding. This can trigger
+ * eviction (e.g., forced eviction of big pages), so it is important to
+ * do it after releasing our snapshot above.
+ */
+ WT_RET(__wt_page_release(session, cbt->ref, 0));
+ cbt->ref = NULL;
+ return (0);
+}
+
+/*
+ * __cursor_func_init --
+ * Cursor call setup.
+ */
+static inline int
+__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+
+ if (reenter)
+ WT_RET(__curfile_leave(cbt));
+ if (!F_ISSET(cbt, WT_CBT_ACTIVE))
+ WT_RET(__curfile_enter(cbt));
+ __wt_txn_cursor_op(session);
+ return (0);
+}
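
For context, a btree cursor operation brackets its work with these
helpers; an illustrative sketch of the calling pattern (the operation body
is elided, and WT_TRET is defined in error.h):

    static int
    __btcur_example_op(WT_CURSOR_BTREE *cbt)
    {
        WT_DECL_RET;

        /* Discard any prior position, then activate the cursor. */
        WT_RET(__cursor_func_init(cbt, 1));

        /* ... search the tree and perform the operation ... */

        /* On error, release whatever position the operation took. */
        if (ret != 0)
            WT_TRET(__cursor_reset(cbt));
        return (ret);
    }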
+
+/*
+ * __cursor_reset --
+ * Reset the cursor.
+ */
+static inline int
+__cursor_reset(WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+
+ /*
+	 * The cursor is leaving the API and no longer holds any position;
+	 * this is generally called to clean up the cursor after an error.
+ */
+ ret = __curfile_leave(cbt);
+ __cursor_pos_clear(cbt);
+ return (ret);
+}
+
+/*
+ * __cursor_row_slot_return --
+ * Return a row-store leaf page slot's K/V pair.
+ */
+static inline int
+__cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
+{
+ WT_BTREE *btree;
+ WT_ITEM *kb, *vb;
+ WT_CELL *cell;
+ WT_CELL_UNPACK *unpack, _unpack;
+ WT_PAGE *page;
+ WT_SESSION_IMPL *session;
+ void *copy;
+
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ btree = S2BT(session);
+ page = cbt->ref->page;
+
+ unpack = NULL;
+
+ kb = &cbt->iface.key;
+ vb = &cbt->iface.value;
+
+ /*
+ * The row-store key can change underfoot; explicitly take a copy.
+ */
+ copy = WT_ROW_KEY_COPY(rip);
+
+ /*
+ * Get a key: we could just call __wt_row_leaf_key, but as a cursor
+ * is running through the tree, we may have additional information
+ * here (we may have the fully-built key that's immediately before
+ * the prefix-compressed key we want, so it's a faster construction).
+ *
+ * First, check for an immediately available key.
+ */
+ if (__wt_row_leaf_key_info(
+ page, copy, NULL, &cell, &kb->data, &kb->size))
+ goto value;
+
+ /* Huffman encoded keys are a slow path in all cases. */
+ if (btree->huffman_key != NULL)
+ goto slow;
+
+ /*
+ * Unpack the cell and deal with overflow and prefix-compressed keys.
+ * Inline building simple prefix-compressed keys from a previous key,
+ * otherwise build from scratch.
+ */
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ if (unpack->type == WT_CELL_KEY &&
+ cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) {
+ WT_ASSERT(session, cbt->tmp.size >= unpack->prefix);
+
+ /*
+		 * Grow the buffer as necessary and ensure the data has been
+		 * copied into local buffer space, then append the suffix to
+		 * the prefix already in the buffer.
+		 *
+		 * Don't grow the buffer unnecessarily or copy data we don't
+		 * need: truncate the item's data length to the prefix bytes.
+ */
+ cbt->tmp.size = unpack->prefix;
+ WT_RET(__wt_buf_grow(
+ session, &cbt->tmp, cbt->tmp.size + unpack->size));
+ memcpy((uint8_t *)cbt->tmp.data + cbt->tmp.size,
+ unpack->data, unpack->size);
+ cbt->tmp.size += unpack->size;
+ } else {
+ /*
+ * Call __wt_row_leaf_key_work instead of __wt_row_leaf_key: we
+ * already did __wt_row_leaf_key's fast-path checks inline.
+ */
+slow: WT_RET(
+ __wt_row_leaf_key_work(session, page, rip, &cbt->tmp, 0));
+ }
+ kb->data = cbt->tmp.data;
+ kb->size = cbt->tmp.size;
+ cbt->rip_saved = rip;
+
+value:
+ /*
+ * If the item was ever modified, use the WT_UPDATE data. Note the
+ * caller passes us the update: it has already resolved which one
+ * (if any) is visible.
+ */
+ if (upd != NULL) {
+ vb->data = WT_UPDATE_DATA(upd);
+ vb->size = upd->size;
+ return (0);
+ }
+
+ /* Else, simple values have their location encoded in the WT_ROW. */
+ if (__wt_row_leaf_value(page, rip, vb))
+ return (0);
+
+ /*
+ * Else, take the value from the original page cell (which may be
+ * empty).
+ */
+ if ((cell = __wt_row_leaf_value_cell(page, rip, unpack)) == NULL) {
+ vb->data = "";
+ vb->size = 0;
+ return (0);
+ }
+
+ unpack = &_unpack;
+ __wt_cell_unpack(cell, unpack);
+ return (__wt_page_cell_data_ref(session, cbt->ref->page, unpack, vb));
+}
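
The prefix-compressed fast path above reduces to simple buffer arithmetic;
a self-contained sketch, independent of the WiredTiger types:

    #include <string.h>

    /*
     * Rebuild the current key from the previous fully-built key already
     * in buf: keep the first "prefix" bytes, append the suffix, return
     * the new length.  The caller guarantees buf can hold prefix +
     * suffix_len bytes -- the same steps as the cbt->tmp code above.
     */
    static size_t
    prefix_rebuild(char *buf, size_t prefix,
        const char *suffix, size_t suffix_len)
    {
        memcpy(buf + prefix, suffix, suffix_len);
        return (prefix + suffix_len);
    }

For example, with "applesauce" in the buffer, a prefix of 5 and the suffix
"pie" yield "applepie".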
diff --git a/src/third_party/wiredtiger/src/include/dhandle.h b/src/third_party/wiredtiger/src/include/dhandle.h
new file mode 100644
index 00000000000..5556627c74d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/dhandle.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * XXX
+ * The server threads use their own WT_SESSION_IMPL handles because they may
+ * want to block (for example, the eviction server calls reconciliation, and
+ * some of the reconciliation diagnostic code reads pages), and the user's
+ * session handle is already blocking on a server thread. The problem is the
+ * server thread needs to reference the correct btree handle, and that's
+ * hanging off the application's thread of control. For now, I'm just making
+ * it obvious where that's getting done.
+ */
+#define WT_SET_BTREE_IN_SESSION(s, b) ((s)->dhandle = b->dhandle)
+#define WT_CLEAR_BTREE_IN_SESSION(s) ((s)->dhandle = NULL)
+
+#define WT_WITH_DHANDLE(s, d, e) do { \
+ WT_DATA_HANDLE *__saved_dhandle = (s)->dhandle; \
+ (s)->dhandle = (d); \
+ e; \
+ (s)->dhandle = __saved_dhandle; \
+} while (0)
+
+#define WT_WITH_BTREE(s, b, e) WT_WITH_DHANDLE(s, (b)->dhandle, e)
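
A usage sketch: the macro saves the session's current handle, points it at
the target for the duration of the expression, and restores it afterward
(the callee here is hypothetical):

    /* Run one operation with the session pointing at btree's handle. */
    WT_WITH_BTREE(session, btree, ret = __hypothetical_btree_op(session));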
+
+/*
+ * WT_DATA_HANDLE --
+ * A handle for a generic named data source.
+ */
+struct __wt_data_handle {
+ WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */
+ SLIST_ENTRY(__wt_data_handle) l;/* Linked list of handles */
+
+ /*
+ * Sessions caching a connection's data handle will have a non-zero
+ * reference count; sessions using a connection's data handle will
+ * have a non-zero in-use count.
+ */
+ uint32_t session_ref; /* Sessions referencing this handle */
+ int32_t session_inuse; /* Sessions using this handle */
+ time_t timeofdeath; /* Use count went to 0 */
+
+ uint64_t name_hash; /* Hash of name */
+ const char *name; /* Object name as a URI */
+ const char *checkpoint; /* Checkpoint name (or NULL) */
+ const char **cfg; /* Configuration information */
+
+ WT_DATA_SOURCE *dsrc; /* Data source for this handle */
+ void *handle; /* Generic handle */
+
+ /*
+ * Data handles can be closed without holding the schema lock; threads
+ * walk the list of open handles, operating on them (checkpoint is the
+ * best example). To avoid sources disappearing underneath checkpoint,
+ * lock the data handle when closing it.
+ */
+ WT_SPINLOCK close_lock; /* Lock to close the handle */
+
+ WT_DSRC_STATS stats; /* Data-source statistics */
+
+	/* Flag values over 0xff are reserved for WT_BTREE_* */
+#define WT_DHANDLE_DISCARD 0x01 /* Discard on release */
+#define WT_DHANDLE_DISCARD_CLOSE 0x02 /* Close on release */
+#define WT_DHANDLE_EXCLUSIVE 0x04 /* Need exclusive access */
+#define WT_DHANDLE_HAVE_REF 0x08 /* Already have ref */
+#define WT_DHANDLE_LOCK_ONLY 0x10 /* Handle only used as a lock */
+#define WT_DHANDLE_OPEN 0x20 /* Handle is open */
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/dlh.h b/src/third_party/wiredtiger/src/include/dlh.h
new file mode 100644
index 00000000000..3974ae2792c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/dlh.h
@@ -0,0 +1,15 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_dlh {
+ TAILQ_ENTRY(__wt_dlh) q; /* List of open libraries. */
+
+ void *handle; /* Handle returned by dlopen. */
+ char *name;
+
+ int (*terminate)(WT_CONNECTION *); /* Terminate function. */
+};
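
For context, the fields map onto the standard POSIX dynamic-loader calls;
a minimal standalone sketch (dlfcn.h, independent of this tree):

    #include <dlfcn.h>
    #include <stdio.h>

    /* Open a library and look up an entry point, WT_DLH-style. */
    static int
    load_example(const char *path, const char *sym)
    {
        void *handle, *func;

        if ((handle = dlopen(path, RTLD_LAZY)) == NULL) {
            (void)fprintf(stderr, "dlopen: %s\n", dlerror());
            return (1);
        }
        if ((func = dlsym(handle, sym)) == NULL)
            (void)fprintf(stderr, "dlsym: %s\n", dlerror());

        /* ... call through func before closing ... */
        return (dlclose(handle));
    }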
diff --git a/src/third_party/wiredtiger/src/include/error.h b/src/third_party/wiredtiger/src/include/error.h
new file mode 100644
index 00000000000..9bccc80faec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/error.h
@@ -0,0 +1,141 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_DEBUG_POINT ((void *)0xdeadbeef)
+#define WT_DEBUG_BYTE (0xab)
+
+/* In DIAGNOSTIC mode, yield in places where we want to encourage races. */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_HAVE_DIAGNOSTIC_YIELD do { \
+ __wt_yield(); \
+} while (0)
+#else
+#define WT_HAVE_DIAGNOSTIC_YIELD
+#endif
+
+/* Set "ret" and branch-to-err-label tests. */
+#define WT_ERR(a) do { \
+ if ((ret = (a)) != 0) \
+ goto err; \
+} while (0)
+#define WT_ERR_MSG(session, v, ...) do { \
+ ret = (v); \
+ __wt_err(session, ret, __VA_ARGS__); \
+ goto err; \
+} while (0)
+#define WT_ERR_BUSY_OK(a) do { \
+ if ((ret = (a)) != 0) { \
+ if (ret == EBUSY) \
+ ret = 0; \
+ else \
+ goto err; \
+ } \
+} while (0)
+#define WT_ERR_NOTFOUND_OK(a) do { \
+ if ((ret = (a)) != 0) { \
+ if (ret == WT_NOTFOUND) \
+ ret = 0; \
+ else \
+ goto err; \
+ } \
+} while (0)
+#define WT_ERR_TEST(a, v) do { \
+ if (a) { \
+ ret = (v); \
+ goto err; \
+ } \
+} while (0)
+
+/* Return tests. */
+#define WT_RET(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0) \
+ return (__ret); \
+} while (0)
+#define WT_RET_TEST(a, v) do { \
+ if (a) \
+ return (v); \
+} while (0)
+#define WT_RET_MSG(session, v, ...) do { \
+ int __ret = (v); \
+ __wt_err(session, __ret, __VA_ARGS__); \
+ return (__ret); \
+} while (0)
+#define WT_RET_BUSY_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != EBUSY) \
+ return (__ret); \
+} while (0)
+#define WT_RET_NOTFOUND_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND) \
+ return (__ret); \
+} while (0)
+/* Set "ret" if not already set. */
+#define WT_TRET(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
+#define WT_TRET_BUSY_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != EBUSY && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
+#define WT_TRET_NOTFOUND_OK(a) do { \
+ int __ret; \
+ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND && \
+ (__ret == WT_PANIC || \
+ ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
+ ret = __ret; \
+} while (0)
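
The intended division of labor: WT_RET for failures before any cleanup is
needed, WT_ERR once cleanup is required, WT_TRET so the cleanup itself
can't hide the first error. A sketch with hypothetical helpers:

    static int
    __example_op(WT_SESSION_IMPL *session)
    {
        WT_DECL_RET;

        /* Nothing to clean up yet: return any failure directly. */
        WT_RET(__step_one(session));        /* hypothetical */

        /* Cleanup now required on failure: branch to the label. */
        WT_ERR(__step_two(session));        /* hypothetical */

    err:    /* WT_TRET keeps the first error (or a panic) set in ret. */
        WT_TRET(__cleanup(session));        /* hypothetical */
        return (ret);
    }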
+
+/* Return and branch-to-err-label cases for switch statements. */
+#define WT_ILLEGAL_VALUE(session) \
+ default: \
+ return (__wt_illegal_value(session, NULL))
+#define WT_ILLEGAL_VALUE_ERR(session) \
+ default: \
+ WT_ERR(__wt_illegal_value(session, NULL))
+#define WT_ILLEGAL_VALUE_SET(session) \
+ default: \
+ ret = __wt_illegal_value(session, NULL); \
+ break
+
+#define WT_PANIC_MSG(session, v, ...) do { \
+ __wt_err(session, v, __VA_ARGS__); \
+ (void)__wt_panic(session); \
+} while (0)
+#define WT_PANIC_ERR(session, v, ...) do { \
+ WT_PANIC_MSG(session, v, __VA_ARGS__); \
+ WT_ERR(WT_PANIC); \
+} while (0)
+#define WT_PANIC_RET(session, v, ...) do { \
+ WT_PANIC_MSG(session, v, __VA_ARGS__); \
+ /* Return WT_PANIC regardless of earlier return codes. */ \
+ return (WT_PANIC); \
+} while (0)
+
+/*
+ * WT_ASSERT
+ * Assert an expression, aborting in diagnostic mode. Otherwise,
+ * "use" the session to keep the compiler quiet and don't evaluate the
+ * expression.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define WT_ASSERT(session, exp) do { \
+ if (!(exp)) \
+ __wt_assert(session, 0, __FILE__, __LINE__, "%s", #exp);\
+} while (0)
+#else
+#define WT_ASSERT(session, exp) \
+ WT_UNUSED(session)
+#endif
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
new file mode 100644
index 00000000000..2ab964475d8
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -0,0 +1,650 @@
+/* DO NOT EDIT: automatically built by dist/s_prototypes. */
+
+extern void __wt_async_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_async_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_async_destroy(WT_SESSION_IMPL *session);
+extern int __wt_async_flush(WT_SESSION_IMPL *session);
+extern int __wt_async_new_op(WT_SESSION_IMPL *session, const char *uri, const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP_IMPL **opp);
+extern int __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op);
+extern int __wt_async_op_init(WT_SESSION_IMPL *session);
+extern void *__wt_async_worker(void *arg);
+extern int __wt_block_addr_to_buffer(WT_BLOCK *block, uint8_t **pp, wt_off_t offset, uint32_t size, uint32_t cksum);
+extern int __wt_block_buffer_to_addr(WT_BLOCK *block, const uint8_t *p, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump);
+extern int __wt_block_addr_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int live);
+extern int __wt_block_addr_string(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_buffer_to_ckpt(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *p, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci);
+extern int __wt_block_ckpt_init( WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name);
+extern int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, int checkpoint);
+extern int __wt_block_checkpoint_unload( WT_SESSION_IMPL *session, WT_BLOCK *block, int checkpoint);
+extern void __wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci);
+extern int __wt_block_checkpoint(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, int data_cksum);
+extern int __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp);
+extern int __wt_block_compact_page_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, int *skipp);
+extern int __wt_block_misplaced(WT_SESSION_IMPL *session, WT_BLOCK *block, const char *tag, wt_off_t offset, uint32_t size, int live);
+extern int __wt_block_off_remove_overlap( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size);
+extern int __wt_block_alloc( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t *offp, wt_off_t size);
+extern int __wt_block_free(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_off_free( WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t offset, wt_off_t size);
+extern int __wt_block_extlist_check( WT_SESSION_IMPL *session, WT_EXTLIST *al, WT_EXTLIST *bl);
+extern int __wt_block_extlist_overlap( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci);
+extern int __wt_block_extlist_merge(WT_SESSION_IMPL *session, WT_EXTLIST *a, WT_EXTLIST *b);
+extern int __wt_block_insert_ext( WT_SESSION_IMPL *session, WT_EXTLIST *el, wt_off_t off, wt_off_t size);
+extern int __wt_block_extlist_read_avail(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size);
+extern int __wt_block_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, wt_off_t ckpt_size);
+extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el, WT_EXTLIST *additional);
+extern int __wt_block_extlist_truncate( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el);
+extern int __wt_block_extlist_init(WT_SESSION_IMPL *session, WT_EXTLIST *el, const char *name, const char *extname, int track_size);
+extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
+extern int __wt_block_map( WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp, void **mappingcookie);
+extern int __wt_block_unmap( WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen, void **mappingcookie);
+extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp);
+extern int __wt_block_manager_truncate( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
+extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
+extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, int readonly, uint32_t allocsize, WT_BLOCK **blockp);
+extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
+extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
+extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
+extern int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t cksum);
+extern int __wt_block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp);
+extern void __wt_block_ext_free(WT_SESSION_IMPL *session, WT_EXT *ext);
+extern int __wt_block_size_alloc(WT_SESSION_IMPL *session, WT_SIZE **szp);
+extern void __wt_block_size_free(WT_SESSION_IMPL *session, WT_SIZE *sz);
+extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max);
+extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max);
+extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
+extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp);
+extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid);
+extern int __wt_block_verify_start( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase);
+extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_verify_ckpt_load( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci);
+extern int __wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block);
+extern int __wt_block_verify_addr(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size);
+extern u_int __wt_block_header(WT_BLOCK *block);
+extern int __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep);
+extern int __wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum);
+extern int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked);
+extern int __wt_bloom_create( WT_SESSION_IMPL *session, const char *uri, const char *config, uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp);
+extern int __wt_bloom_open(WT_SESSION_IMPL *session, const char *uri, uint32_t factor, uint32_t k, WT_CURSOR *owner, WT_BLOOM **bloomp);
+extern int __wt_bloom_insert(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_finalize(WT_BLOOM *bloom);
+extern int __wt_bloom_hash(WT_BLOOM *bloom, WT_ITEM *key, WT_BLOOM_HASH *bhash);
+extern int __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash);
+extern int __wt_bloom_get(WT_BLOOM *bloom, WT_ITEM *key);
+extern int __wt_bloom_close(WT_BLOOM *bloom);
+extern int __wt_bloom_drop(WT_BLOOM *bloom, const char *config);
+extern int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
+extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next);
+extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating);
+extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating);
+extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp);
+extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp);
+extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt);
+extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v);
+extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile);
+extern int __wt_debug_offset_blind( WT_SESSION_IMPL *session, wt_off_t offset, const char *ofile);
+extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile);
+extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile);
+extern int __wt_debug_tree_shape( WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree_all(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
+extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
+extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
+extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
+extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages);
+extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages);
+extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
+extern int __wt_evict_create(WT_SESSION_IMPL *session);
+extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
+extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session);
+extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
+extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app);
+extern void __wt_cache_dump(WT_SESSION_IMPL *session);
+extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
+extern int __wt_btree_close(WT_SESSION_IMPL *session);
+extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno);
+extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
+extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep);
+extern void __wt_btree_evictable(WT_SESSION_IMPL *session, int on);
+extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize);
+extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
+extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
+extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
+extern int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed);
+extern const char *__wt_page_type_string(u_int type);
+extern const char *__wt_cell_type_string(uint8_t type);
+extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
+extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
+extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
+extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
+extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
+extern int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
+extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep);
+extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
+extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
+extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
+extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op);
+extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk, size_t size);
+extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf);
+extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags);
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
+extern int __wt_rec_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
+extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
+extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
+extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
+extern int __wt_split_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive);
+extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell);
+extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size);
+extern int __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size);
+extern void __wt_ovfl_reuse_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_txnc_search( WT_PAGE *page, const uint8_t *addr, size_t addr_size, WT_ITEM *store);
+extern int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size);
+extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_rec_write(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags);
+extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_fix(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_bulk_insert_var(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk);
+extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
+extern int __wt_row_leaf_key_work(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, int instantiate);
+extern int __wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
+extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, void *ikeyp);
+extern int __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep);
+extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep);
+extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd);
+extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd);
+extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
+extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert);
+extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
+extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
+extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
+extern int __wt_config_next(WT_CONFIG *conf, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_get(WT_SESSION_IMPL *session, const char **cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_gets(WT_SESSION_IMPL *session, const char **cfg, const char *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_getone(WT_SESSION_IMPL *session, const char *config, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_getones(WT_SESSION_IMPL *session, const char *config, const char *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_gets_def(WT_SESSION_IMPL *session, const char **cfg, const char *key, int def, WT_CONFIG_ITEM *value);
+extern int __wt_config_subgetraw(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+extern int __wt_config_subgets(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cfg, const char *key, WT_CONFIG_ITEM *value);
+extern void __wt_conn_foc_discard(WT_SESSION_IMPL *session);
+extern int __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char *uri, const char *config, const char *type, const char *check);
+extern int __wt_config_check(WT_SESSION_IMPL *session, const WT_CONFIG_ENTRY *entry, const char *config, size_t config_len);
+extern int __wt_config_collapse( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_config_merge( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_config_concat( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret);
+extern int __wt_conn_config_init(WT_SESSION_IMPL *session);
+extern void __wt_conn_config_discard(WT_SESSION_IMPL *session);
+extern int __wt_ext_config_parser_open(WT_EXTENSION_API *wt_ext, WT_SESSION *wt_session, const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+extern int __wt_ext_config_get(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_CONFIG_ARG *cfg_arg, const char *key, WT_CONFIG_ITEM *cval);
+extern int __wt_config_upgrade(WT_SESSION_IMPL *session, WT_ITEM *buf);
+extern int __wt_collator_config(WT_SESSION_IMPL *session, const char **cfg, WT_COLLATOR **collatorp, int *ownp);
+extern int __wt_conn_remove_collator(WT_SESSION_IMPL *session);
+extern int __wt_conn_remove_compressor(WT_SESSION_IMPL *session);
+extern int __wt_conn_remove_data_source(WT_SESSION_IMPL *session);
+extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_cache_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_cache_destroy(WT_SESSION_IMPL *session);
+extern int __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session);
+extern int __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session);
+extern void *__wt_cache_pool_server(void *arg);
+extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session);
+extern int __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize);
+extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force);
+extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, const char *op_cfg[], uint32_t flags);
+extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, int apply_checkpoints, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern void __wt_conn_btree_close(WT_SESSION_IMPL *session);
+extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *name, int force);
+extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final);
+extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
+extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
+extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
+extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session);
+extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]);
+extern int __wt_connection_close(WT_CONNECTION_IMPL *conn);
+extern int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_conn_stat_init(WT_SESSION_IMPL *session);
+extern int __wt_statlog_log_one(WT_SESSION_IMPL *session);
+extern int __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close);
+extern int __wt_sweep_create(WT_SESSION_IMPL *session);
+extern int __wt_sweep_destroy(WT_SESSION_IMPL *session);
+extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, int *skip);
+extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, int bitmap, int skip_sort_check);
+extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp);
+extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp);
+extern int __wt_curfile_update_check(WT_CURSOR *cursor);
+extern int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap, WT_CURSOR **cursorp);
+extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, int iskey, va_list ap);
+extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor);
+extern size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode);
+extern int __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf);
+extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen);
+extern const char *__wt_json_tokname(int toktype);
+extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item);
+extern ssize_t __wt_json_strlen(const char *src, size_t srclen);
+extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen);
+extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst);
+extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_cursor_notsup(WT_CURSOR *cursor);
+extern int __wt_cursor_noop(WT_CURSOR *cursor);
+extern void __wt_cursor_set_notsup(WT_CURSOR *cursor);
+extern int __wt_cursor_config_readonly(WT_CURSOR *cursor, const char *cfg[], int def);
+extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, int key);
+extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...);
+extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...);
+extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
+extern void __wt_cursor_set_raw_key(WT_CURSOR *cursor, WT_ITEM *key);
+extern int __wt_cursor_get_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
+extern void __wt_cursor_set_raw_value(WT_CURSOR *cursor, WT_ITEM *value);
+extern int __wt_cursor_get_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
+extern void __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap);
+extern int __wt_cursor_get_value(WT_CURSOR *cursor, ...);
+extern int __wt_cursor_get_valuev(WT_CURSOR *cursor, va_list ap);
+extern void __wt_cursor_set_value(WT_CURSOR *cursor, ...);
+extern void __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap);
+extern int __wt_cursor_close(WT_CURSOR *cursor);
+extern int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor);
+extern int __wt_cursor_init(WT_CURSOR *cursor, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curtable_get_key(WT_CURSOR *cursor, ...);
+extern int __wt_curtable_get_value(WT_CURSOR *cursor, ...);
+extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...);
+extern void __wt_curtable_set_value(WT_CURSOR *cursor, ...);
+extern int __wt_table_range_truncate(WT_CURSOR_TABLE *start, WT_CURSOR_TABLE *stop);
+extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
+extern int __wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp);
+extern int __wt_log_get_active_files( WT_SESSION_IMPL *session, char ***filesp, u_int *countp);
+extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
+extern int __wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf);
+extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
+extern int __wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum);
+extern int __wt_log_open(WT_SESSION_IMPL *session);
+extern int __wt_log_close(WT_SESSION_IMPL *session);
+extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create);
+extern int __wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
+extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie);
+extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
+extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
+extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp);
+extern void __wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp);
+extern int __wt_logrec_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *rectypep);
+extern int __wt_logop_read(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *optypep, uint32_t *opsizep);
+extern int __wt_logop_col_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno, WT_ITEM *value);
+extern int __wt_logop_col_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep);
+extern int __wt_logop_col_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t recno);
+extern int __wt_logop_col_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *recnop);
+extern int __wt_logop_col_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_col_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, uint64_t start, uint64_t stop);
+extern int __wt_logop_col_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, uint64_t *startp, uint64_t *stopp);
+extern int __wt_logop_col_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_put_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key, WT_ITEM *value);
+extern int __wt_logop_row_put_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep);
+extern int __wt_logop_row_put_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_remove_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *key);
+extern int __wt_logop_row_remove_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *keyp);
+extern int __wt_logop_row_remove_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logrec, uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode);
+extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
+extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern int __wt_log_slot_init(WT_SESSION_IMPL *session);
+extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session);
+extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp);
+extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
+extern int __wt_log_slot_free(WT_LOGSLOT *slot);
+extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize);
+extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
+extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg);
+extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session);
+extern void __wt_lsm_manager_free_work_unit( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry);
+extern int __wt_lsm_manager_destroy(WT_SESSION_IMPL *session);
+extern int __wt_lsm_manager_clear_tree( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_manager_pop_entry( WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp);
+extern int __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id);
+extern int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_curstat_lsm_init( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst);
+extern int __wt_lsm_tree_close_all(WT_SESSION_IMPL *session);
+extern int __wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp);
+extern int __wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp);
+extern int __wt_lsm_tree_set_chunk_size( WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_tree_setup_chunk( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config);
+extern int __wt_lsm_tree_get(WT_SESSION_IMPL *session, const char *uri, int exclusive, WT_LSM_TREE **treep);
+extern void __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern void __wt_lsm_tree_throttle( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only);
+extern int __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_drop( WT_SESSION_IMPL *session, const char *name, const char *cfg[]);
+extern int __wt_lsm_tree_rename(WT_SESSION_IMPL *session, const char *olduri, const char *newuri, const char *cfg[]);
+extern int __wt_lsm_tree_truncate( WT_SESSION_IMPL *session, const char *name, const char *cfg[]);
+extern int __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip);
+extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags);
+extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp);
+extern int __wt_lsm_work_switch( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran);
+extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk);
+extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree);
+extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args);
+extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]);
+extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt);
+extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep);
+extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname);
+extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep);
+extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn);
+extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase);
+extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt);
+extern int __wt_ext_metadata_insert(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value);
+extern int __wt_ext_metadata_remove( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key);
+extern int __wt_ext_metadata_search(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char **valuep);
+extern int __wt_ext_metadata_update(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value);
+extern int __wt_metadata_get_ckptlist( WT_SESSION *session, const char *name, WT_CKPT **ckptbasep);
+extern void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase);
+extern int __wt_metadata_open(WT_SESSION_IMPL *session);
+extern int __wt_metadata_cursor( WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp);
+extern int __wt_metadata_insert( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_metadata_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern int __wt_metadata_remove(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_metadata_search( WT_SESSION_IMPL *session, const char *key, const char **valuep);
+extern void __wt_meta_track_discard(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_off(WT_SESSION_IMPL *session, int unroll);
+extern int __wt_meta_track_sub_on(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_sub_off(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session);
+extern int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key);
+extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri);
+extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created);
+extern int __wt_turtle_init(WT_SESSION_IMPL *session);
+extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep);
+extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value);
+extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_ATTRIBUTE((noreturn));
+extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp);
+extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
+extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
+extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp);
+extern int __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp);
+extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg);
+extern int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp);
+extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp);
+extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, int fail, void *sym_ret);
+extern int __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh);
+extern int __wt_errno(void);
+extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp);
+extern void __wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len);
+extern int __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep);
+extern int __wt_filesize_name( WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep);
+extern int __wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock);
+extern int __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len);
+extern int __wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp);
+extern int __wt_getopt( const char *progname, int nargc, char *const *nargv, const char *ostr);
+extern int __wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie);
+extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size);
+extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size);
+extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
+extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp);
+extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs);
+extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
+extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
+extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name);
+extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
+extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
+extern int __wt_once(void (*init_routine)(void));
+extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp);
+extern int __wt_close(WT_SESSION_IMPL *session, WT_FH *fh);
+extern int __wt_absolute_path(const char *path);
+extern const char *__wt_path_separator(void);
+extern int __wt_has_priv(void);
+extern int __wt_remove(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to);
+extern int __wt_read( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf);
+extern int __wt_write(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, const void *buf);
+extern void __wt_sleep(long seconds, long micro_seconds);
+extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base);
+extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, void *(*func)(void *), void *arg);
+extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid);
+extern void __wt_thread_id(char *buf, size_t buflen);
+extern int __wt_seconds(WT_SESSION_IMPL *session, time_t *timep);
+extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
+extern void __wt_yield(void);
+extern int __wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t *sizep, const char *fmt, ...);
+extern int __wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_struct_check(WT_SESSION_IMPL *session, const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp);
+extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...);
+extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...);
+extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep);
+extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf);
+extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf);
+extern int __wt_schema_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
+extern int __wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
+extern int __wt_schema_get_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep);
+extern void __wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup);
+extern void __wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx);
+extern void __wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_remove_table( WT_SESSION_IMPL *session, WT_TABLE *table);
+extern void __wt_schema_close_tables(WT_SESSION_IMPL *session);
+extern int __wt_schema_colgroup_name(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf);
+extern int __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp);
+extern int __wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, WT_TABLE **tablep);
+extern int __wt_schema_get_colgroup(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp);
+extern int __wt_schema_get_index(WT_SESSION_IMPL *session, const char *uri, WT_TABLE **tablep, WT_INDEX **indexp);
+extern int __wt_schema_colcheck(WT_SESSION_IMPL *session, const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf, u_int *kcolsp, u_int *vcolsp);
+extern int __wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table);
+extern int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, int value_only, WT_ITEM *plan);
+extern int __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, const char *extra_cols, int value_only, WT_ITEM *format);
+extern int __wt_struct_truncate(WT_SESSION_IMPL *session, const char *input_fmt, u_int ncols, WT_ITEM *format);
+extern int __wt_schema_project_in(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap);
+extern int __wt_schema_project_out(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap);
+extern int __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value);
+extern int __wt_schema_project_merge(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value);
+extern int __wt_schema_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri, const char *cfg[]);
+extern int __wt_curstat_colgroup_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_index_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_curstat_table_init(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR_STAT *cst);
+extern int __wt_schema_truncate( WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
+extern int __wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop);
+extern int __wt_schema_range_truncate( WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop);
+extern WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str);
+extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len);
+extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags);
+extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session);
+extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
+extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_session_create_strip(WT_SESSION *wt_session, const char *v1, const char *v2, const char **value_ret);
+extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp);
+extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp);
+extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip);
+extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config);
+extern void __wt_session_dhandle_incr_use(WT_SESSION_IMPL *session);
+extern int __wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags);
+extern int __wt_session_release_btree(WT_SESSION_IMPL *session);
+extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags);
+extern void __wt_session_close_cache(WT_SESSION_IMPL *session);
+extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags);
+extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint);
+extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]);
+extern uint32_t __wt_cksum(const void *chunk, size_t len);
+extern void __wt_cksum_init(void);
+extern void __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler);
+extern int __wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error, const char *file_name, int line_number, const char *fmt, va_list ap);
+extern void __wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern void __wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+extern int __wt_ext_err_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 2, 3)));
+extern int __wt_ext_msg_printf( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v);
+extern void __wt_assert(WT_SESSION_IMPL *session, int error, const char *file_name, int line_number, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 5, 6)));
+extern int __wt_panic(WT_SESSION_IMPL *session);
+extern int __wt_illegal_value(WT_SESSION_IMPL *session, const char *name);
+extern int __wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri);
+extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri);
+extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path);
+extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path);
+extern int __wt_library_init(void);
+extern int __wt_breakpoint(void);
+extern void __wt_attach(WT_SESSION_IMPL *session);
+extern uint64_t __wt_hash_city64(const void *s, size_t len);
+extern uint64_t __wt_hash_fnv64(const void *string, size_t len);
+extern int
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern int __wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern void __wt_hazard_close(WT_SESSION_IMPL *session);
+extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
+extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to);
+extern int __wt_hex2byte(const u_char *from, u_char *to);
+extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to);
+extern int __wt_nhex_to_raw( WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to);
+extern int __wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to);
+extern int __wt_huffman_open(WT_SESSION_IMPL *session, void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp);
+extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
+extern int __wt_print_huffman_code(void *huffman_arg, uint16_t symbol);
+extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf);
+extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf);
+extern int __wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
+extern void __wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
+extern int __wt_spin_lock_register_caller(WT_SESSION_IMPL *session, const char *name, const char *file, int line, int *idp);
+extern int __wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag);
+extern uint32_t __wt_nlpo2_round(uint32_t v);
+extern uint32_t __wt_nlpo2(uint32_t v);
+extern uint32_t __wt_log2_int(uint32_t n);
+extern int __wt_ispo2(uint32_t v);
+extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
+extern void __wt_random_init(uint32_t *rnd);
+extern uint32_t __wt_random(uint32_t *rnd);
+extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size);
+extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_ATTRIBUTE((format (printf, 3, 4)));
+extern int
+__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ );
+extern void __wt_scr_discard(WT_SESSION_IMPL *session);
+extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size);
+extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p);
+extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats);
+extern void __wt_stat_refresh_dsrc_stats(void *stats_arg);
+extern void __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent);
+extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_refresh_connection_stats(void *stats_arg);
+extern int __wt_txnid_cmp(const void *v1, const void *v2);
+extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
+extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session);
+extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot);
+extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_txn_release(WT_SESSION_IMPL *session);
+extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_init(WT_SESSION_IMPL *session);
+extern void __wt_txn_stats_update(WT_SESSION_IMPL *session);
+extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
+extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session);
+extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len);
+extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_checkpoint_close(WT_SESSION_IMPL *session, int force);
+extern uint64_t __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session);
+extern int __wt_ext_transaction_isolation_level( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session);
+extern int __wt_ext_transaction_notify( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify);
+extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api);
+extern int __wt_ext_transaction_visible( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id);
+extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op);
+extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]);
+extern int __wt_txn_checkpoint_logread( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn);
+extern int __wt_txn_checkpoint_log( WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp);
+extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session);
+extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out);
+extern int __wt_txn_recover(WT_CONNECTION_IMPL *conn);
diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h
new file mode 100644
index 00000000000..3aac7193407
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/flags.h
@@ -0,0 +1,88 @@
+/*
+ * DO NOT EDIT: automatically built by dist/flags.py.
+ * flags section: BEGIN
+ */
+#define WT_CONN_CACHE_POOL 0x00001000
+#define WT_CONN_CKPT_SYNC 0x00000800
+#define WT_CONN_EVICTION_RUN 0x00000400
+#define WT_CONN_LEAK_MEMORY 0x00000200
+#define WT_CONN_LSM_MERGE 0x00000100
+#define WT_CONN_PANIC 0x00000080
+#define WT_CONN_SERVER_ASYNC 0x00000040
+#define WT_CONN_SERVER_CHECKPOINT 0x00000020
+#define WT_CONN_SERVER_LSM 0x00000010
+#define WT_CONN_SERVER_RUN 0x00000008
+#define WT_CONN_SERVER_STATISTICS 0x00000004
+#define WT_CONN_SERVER_SWEEP 0x00000002
+#define WT_CONN_WAS_BACKUP 0x00000001
+#define WT_EVICTING 0x00000004
+#define WT_FILE_TYPE_CHECKPOINT 0x00000004
+#define WT_FILE_TYPE_DATA 0x00000002
+#define WT_FILE_TYPE_LOG 0x00000001
+#define WT_LOGSCAN_FIRST 0x00000008
+#define WT_LOGSCAN_FROM_CKP 0x00000004
+#define WT_LOGSCAN_ONE 0x00000002
+#define WT_LOGSCAN_RECOVER 0x00000001
+#define WT_LOG_DSYNC 0x00000004
+#define WT_LOG_FLUSH 0x00000002
+#define WT_LOG_FSYNC 0x00000001
+#define WT_READ_CACHE 0x00000200
+#define WT_READ_COMPACT 0x00000100
+#define WT_READ_NO_EVICT 0x00000080
+#define WT_READ_NO_GEN 0x00000040
+#define WT_READ_NO_WAIT 0x00000020
+#define WT_READ_PREV 0x00000010
+#define WT_READ_SKIP_INTL 0x00000008
+#define WT_READ_SKIP_LEAF 0x00000004
+#define WT_READ_TRUNCATE 0x00000002
+#define WT_READ_WONT_NEED 0x00000001
+#define WT_SESSION_CAN_WAIT 0x00000800
+#define WT_SESSION_DISCARD_FORCE 0x00000400
+#define WT_SESSION_INTERNAL 0x00000200
+#define WT_SESSION_LOGGING_INMEM 0x00000100
+#define WT_SESSION_NO_CACHE 0x00000080
+#define WT_SESSION_NO_CACHE_CHECK 0x00000040
+#define WT_SESSION_NO_DATA_HANDLES 0x00000020
+#define WT_SESSION_NO_LOGGING 0x00000010
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00000008
+#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00000004
+#define WT_SESSION_SCHEMA_LOCKED 0x00000002
+#define WT_SESSION_SERVER_ASYNC 0x00000001
+#define WT_SKIP_UPDATE_ERR 0x00000002
+#define WT_SKIP_UPDATE_RESTORE 0x00000001
+#define WT_SYNC_CHECKPOINT 0x00000010
+#define WT_SYNC_CLOSE 0x00000008
+#define WT_SYNC_DISCARD 0x00000004
+#define WT_SYNC_DISCARD_FORCE 0x00000002
+#define WT_SYNC_WRITE_LEAVES 0x00000001
+#define WT_TXN_LOG_CKPT_FAIL 0x00000008
+#define WT_TXN_LOG_CKPT_PREPARE 0x00000004
+#define WT_TXN_LOG_CKPT_START 0x00000002
+#define WT_TXN_LOG_CKPT_STOP 0x00000001
+#define WT_VERB_API 0x00400000
+#define WT_VERB_BLOCK 0x00200000
+#define WT_VERB_CHECKPOINT 0x00100000
+#define WT_VERB_COMPACT 0x00080000
+#define WT_VERB_EVICT 0x00040000
+#define WT_VERB_EVICTSERVER 0x00020000
+#define WT_VERB_FILEOPS 0x00010000
+#define WT_VERB_LOG 0x00008000
+#define WT_VERB_LSM 0x00004000
+#define WT_VERB_METADATA 0x00002000
+#define WT_VERB_MUTEX 0x00001000
+#define WT_VERB_OVERFLOW 0x00000800
+#define WT_VERB_READ 0x00000400
+#define WT_VERB_RECONCILE 0x00000200
+#define WT_VERB_RECOVERY 0x00000100
+#define WT_VERB_SALVAGE 0x00000080
+#define WT_VERB_SHARED_CACHE 0x00000040
+#define WT_VERB_SPLIT 0x00000020
+#define WT_VERB_TEMPORARY 0x00000010
+#define WT_VERB_TRANSACTION 0x00000008
+#define WT_VERB_VERIFY 0x00000004
+#define WT_VERB_VERSION 0x00000002
+#define WT_VERB_WRITE 0x00000001
+/*
+ * flags section: END
+ * DO NOT EDIT: automatically built by dist/flags.py.
+ */
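Editor's note: the values above are distinct powers of two within each flag
namespace, so several flags share a single word and are tested with mask
arithmetic. A minimal sketch of that pattern, assuming only this header is
included; the EX_* macros and the helper function are illustrative stand-ins,
not WiredTiger's own F_SET/F_ISSET family (which lives elsewhere in the tree):

#include <stdint.h>

#define EX_SET(word, mask)   ((word) |= (mask))
#define EX_CLR(word, mask)   ((word) &= ~(mask))
#define EX_ISSET(word, mask) (((word) & (mask)) != 0)

/* True if the servers are running and the connection has not panicked. */
static int
ex_conn_running(uint32_t conn_flags)
{
        return (EX_ISSET(conn_flags, WT_CONN_SERVER_RUN) &&
            !EX_ISSET(conn_flags, WT_CONN_PANIC));
}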
diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h
new file mode 100644
index 00000000000..50e237a1fed
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/gcc.h
@@ -0,0 +1,152 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Add GCC-specific attributes to types and function declarations. */
+#define WT_GCC_ATTRIBUTE(x) __attribute__(x)
+
+/*
+ * Attributes are only permitted on function declarations, not definitions.
+ * This macro is a marker for function definitions that is rewritten by
+ * dist/s_prototypes to create extern.h.
+ */
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+/*
+ * Atomic writes:
+ *
+ * WiredTiger requires pointers (void *) and some variables to be read/written
+ * atomically, that is, in a single cycle. This is not write ordering -- to be
+ * clear, the requirement is that no partial value can ever be read or written.
+ * For example, if 8 bits of a 32-bit quantity were written, then the rest of
+ * the 32 bits were written, and another thread of control was able to read the
+ * memory location after the first 8 bits were written and before the subsequent
+ * 24 bits were written, WiredTiger would break. Or, if two threads of control
+ * attempt to write the same location simultaneously, the result must be one or
+ * the other of the two values, not some combination of both.
+ *
+ * To reduce memory requirements, we use a 32-bit type on 64-bit machines, which
+ * is OK if the compiler doesn't accumulate two adjacent 32-bit variables into a
+ * single 64-bit write, that is, there needs to be a single load/store of the 32
+ * bits, not a load/store of 64 bits, where the 64 bits are composed of two
+ * adjacent 32-bit locations. The problem is when two threads are cooperating
+ * (thread X finds 32-bits set to 0, writes in a new value, flushes memory;
+ * thread Y reads 32-bits that are non-zero, does some operation, resets the
+ * memory location to 0 and flushes). If thread X were to read the 32 bits
+ * adjacent to a different 32 bits, and write them both, the two threads could
+ * race. If that can happen, you must increase the size of the memory type to
+ * a type guaranteed to be written atomically in a single cycle, without writing
+ * an adjacent memory location.
+ *
+ * WiredTiger additionally requires atomic writes for 64-bit memory locations,
+ * and so cannot run on machines with a 32-bit memory bus.
+ *
+ * We don't depend on writes across cache lines being atomic, and to make sure
+ * that never happens, we check address alignment: we know of no architectures
+ * with cache lines other than a multiple of 4 bytes in size, so aligned 4-byte
+ * accesses will always be in a single cache line.
+ *
+ * Atomic writes are often associated with memory barriers, implemented by the
+ * WT_READ_BARRIER and WT_WRITE_BARRIER macros. WiredTiger's requirement as
+ * described by the Solaris membar_enter description:
+ *
+ * No stores from after the memory barrier will reach visibility and
+ * no loads from after the barrier will be resolved before the lock
+ * acquisition reaches global visibility
+ *
+ * In other words, the WT_WRITE_BARRIER macro must ensure that memory stores by
+ * the processor, made before the WT_WRITE_BARRIER call, be visible to all
+ * processors in the system before any memory stores by the processor, made
+ * after the WT_WRITE_BARRIER call, are visible to any processor. The
+ * WT_READ_BARRIER macro ensures that all loads before the barrier are complete
+ * before any loads after the barrier. The compiler cannot reorder or cache
+ * values across a barrier.
+ *
+ * Lock and unlock operations imply both read and write barriers. In other
+ * words, barriers are not required for values protected by locking.
+ *
+ * Data locations may also be marked volatile, forcing the compiler to re-load
+ * the data on each access. This is a weaker semantic than barriers provide,
+ * only ensuring that the compiler will not cache values. It makes no ordering
+ * guarantees and may have no effect on systems with weaker cache guarantees.
+ *
+ * In summary, locking > barriers > volatile.
+ *
+ * To avoid locking shared data structures such as statistics and to permit
+ * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS
+ * (compare and swap) operations.
+ */
+#define __WT_ATOMIC_ADD(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val))
+#define __WT_ATOMIC_FETCH_ADD(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val))
+#define __WT_ATOMIC_CAS(v, old, new, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_bool_compare_and_swap(&(v), old, new))
+#define __WT_ATOMIC_CAS_VAL(v, old, new, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_val_compare_and_swap(&(v), old, new))
+#define __WT_ATOMIC_STORE(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ __sync_lock_test_and_set(&(v), val))
+#define __WT_ATOMIC_SUB(v, val, n) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_sub_and_fetch(&(v), val))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1)
+#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 1)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2)
+#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2)
+#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 2)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4)
+#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 4)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8)
+#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8)
+#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new, 8)
+#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8)
+
+/* Compile read-write barrier */
+#define WT_BARRIER() __asm__ volatile("" ::: "memory")
+
+/* Pause instruction to prevent excess processor bus usage */
+#define WT_PAUSE() __asm__ volatile("pause\n" ::: "memory")
+
+#if defined(x86_64) || defined(__x86_64__)
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("mfence" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() do { \
+ __asm__ volatile ("lfence" ::: "memory"); \
+} while (0)
+#define WT_WRITE_BARRIER() do { \
+ __asm__ volatile ("sfence" ::: "memory"); \
+} while (0)
+
+#elif defined(i386) || defined(__i386__)
+#define WT_FULL_BARRIER() do { \
+ __asm__ volatile ("lock; addl $0, 0(%%esp)" ::: "memory"); \
+} while (0)
+#define WT_READ_BARRIER() WT_FULL_BARRIER()
+#define WT_WRITE_BARRIER() WT_FULL_BARRIER()
+
+#else
+#error "No write barrier implementation for this hardware"
+#endif
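Editor's note: a minimal sketch of the cooperating-threads handoff the
atomic-writes comment above describes, written directly against the GCC
__sync builtins these macros wrap. The slot variable, the function names,
and the zero-means-empty convention are illustrative assumptions (the value
published must therefore be non-zero):

#include <stdint.h>

static volatile uint32_t ex_slot;       /* Shared 32-bit location */

/* Thread X: install a value only if the slot is currently empty. */
static int
ex_try_publish(uint32_t value)
{
        /* Atomically: if (ex_slot == 0) ex_slot = value; */
        return (__sync_bool_compare_and_swap(&ex_slot, 0, value));
}

/* Thread Y: take the published value and release the slot. */
static uint32_t
ex_take(void)
{
        uint32_t v;

        while ((v = ex_slot) == 0)      /* Spin until a value appears */
                ;
        __sync_synchronize();           /* Full barrier before using it */
        ex_slot = 0;                    /* Reset for the next round */
        return (v);
}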
diff --git a/src/third_party/wiredtiger/src/include/hardware.h b/src/third_party/wiredtiger/src/include/hardware.h
new file mode 100644
index 00000000000..720f512cf2d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/hardware.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Publish a value to a shared location. All previous stores must complete
+ * before the value is made public.
+ */
+#define WT_PUBLISH(v, val) do { \
+ WT_WRITE_BARRIER(); \
+ (v) = (val); \
+} while (0)
+
+/*
+ * Read a shared location and guarantee that subsequent reads do not see any
+ * earlier state.
+ */
+#define WT_ORDERED_READ(v, val) do { \
+ (v) = (val); \
+ WT_READ_BARRIER(); \
+} while (0)
+
+/*
+ * Atomic versions of the flag set/clear macros.
+ */
+#define F_ISSET_ATOMIC(p, mask) ((p)->flags_atomic & (uint8_t)(mask))
+
+#define F_SET_ATOMIC(p, mask) do { \
+ uint8_t __orig; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CAS_ATOMIC(p, mask, ret) do { \
+ uint8_t __orig; \
+ ret = 0; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ if ((__orig & (uint8_t)(mask)) != 0) { \
+ ret = EBUSY; \
+ break; \
+ } \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CLR_ATOMIC(p, mask) do { \
+ uint8_t __orig; \
+ do { \
+ __orig = (p)->flags_atomic; \
+ } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
+ __orig, __orig & ~(uint8_t)(mask))); \
+} while (0)
+
+#define WT_CACHE_LINE_ALIGNMENT 64 /* Cache line alignment */
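Editor's note: a minimal sketch of how WT_PUBLISH and WT_ORDERED_READ are
meant to pair up, assuming this header is in scope; the structure and names
are illustrative. The writer completes its ordinary stores before publishing
the pointer, and the reader's barrier keeps its later loads from being
satisfied with earlier state:

struct ex_thing {
        int payload;
};

static struct ex_thing *ex_shared;      /* Shared pointer, NULL until ready */

/* Writer: fill in the structure, then make it visible. */
static void
ex_writer(struct ex_thing *t)
{
        t->payload = 42;                /* Ordinary stores first... */
        WT_PUBLISH(ex_shared, t);       /* ...write barrier, then the store */
}

/* Reader: load the pointer, then safely read what it references. */
static int
ex_reader(void)
{
        struct ex_thing *t;

        WT_ORDERED_READ(t, ex_shared);  /* Load, then read barrier */
        return (t == NULL ? -1 : t->payload);
}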
diff --git a/src/third_party/wiredtiger/src/include/intpack.i b/src/third_party/wiredtiger/src/include/intpack.i
new file mode 100644
index 00000000000..01559657acd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/intpack.i
@@ -0,0 +1,371 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Variable-length integer encoding.
+ * We need up to 64 bits, signed and unsigned. Further, we want the packed
+ * representation to have the same lexicographic ordering as the integer
+ * values. This avoids the need for special-purpose comparison code.
+ *
+ * Try hard to keep small values small (up to ~2 bytes): that gives the biggest
+ * benefit for common cases storing small values. After that, just encode the
+ * length in the first byte: we could squeeze in a couple of extra bits, but
+ * the marginal benefit is small, and we want this code to be relatively
+ * easy to implement in client code or scripting APIs.
+ *
+ * First byte  | Next |                        |
+ * byte        | bytes| Min Value              | Max Value
+ * ------------+------+------------------------+--------------------------------
+ * [00 00xxxx] | free | N/A                    | N/A
+ * [00 01llll] | llll | -2^64                  | -2^13 - 2^6
+ * [00 1xxxxx] | 1    | -2^13 - 2^6            | -2^6 - 1
+ * [01 xxxxxx] | 0    | -2^6                   | -1
+ * [10 xxxxxx] | 0    | 0                      | 2^6 - 1
+ * [11 0xxxxx] | 1    | 2^6                    | 2^13 + 2^6 - 1
+ * [11 10llll] | llll | 2^13 + 2^6             | 2^64 - 1
+ * [11 11xxxx] | free | N/A                    | N/A
+ */
+
+#define NEG_MULTI_MARKER (uint8_t)0x10
+#define NEG_2BYTE_MARKER (uint8_t)0x20
+#define NEG_1BYTE_MARKER (uint8_t)0x40
+#define POS_1BYTE_MARKER (uint8_t)0x80
+#define POS_2BYTE_MARKER (uint8_t)0xc0
+#define POS_MULTI_MARKER (uint8_t)0xe0
+
+#define NEG_1BYTE_MIN ((-1) << 6)
+#define NEG_2BYTE_MIN (((-1) << 13) + NEG_1BYTE_MIN)
+#define POS_1BYTE_MAX ((1 << 6) - 1)
+#define POS_2BYTE_MAX ((1 << 13) + POS_1BYTE_MAX)
+
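+/*
+ * Editor's note -- a worked example derived from the table above: the
+ * unsigned value 1000 falls in the two-byte range, so packing first
+ * subtracts POS_1BYTE_MAX + 1 = 64, leaving 936 = 0x3A8. The first
+ * byte is POS_2BYTE_MARKER | (936 >> 8) = 0xc0 | 0x03 = 0xc3, and the
+ * second is 936 & 0xff = 0xa8, so 1000 packs as the bytes c3 a8.
+ * Unpacking reverses the steps: ((0x03 << 8) | 0xa8) + 64 = 1000.
+ */
+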
+/* Extract bits <start> to <end> from a value (counting from LSB == 0). */
+#define GET_BITS(x, start, end) \
+ (((uint64_t)(x) & ((1U << (start)) - 1U)) >> (end))
+
+#define WT_SIZE_CHECK(l, maxl) \
+ WT_RET_TEST((maxl) != 0 && (size_t)(l) > (maxl), ENOMEM)
+
+/* Count the leading zero bytes. */
+#if defined(__GNUC__)
+#define WT_LEADING_ZEROS(x, i) \
+ (i = (x == 0) ? (int)sizeof (x) : __builtin_clzll(x) >> 3)
+#elif defined(_MSC_VER)
+#define WT_LEADING_ZEROS(x, i) do { \
+ if (x == 0) i = (int)sizeof(x); \
+ else { \
+ unsigned long __index; \
+ _BitScanReverse64(&__index, x); \
+ __index = 63 ^ __index; \
+ i = (int)(__index >> 3); } \
+ } while (0)
+#else
+#define WT_LEADING_ZEROS(x, i) do { \
+ uint64_t __x = (x); \
+ uint64_t __m = (uint64_t)0xff << 56; \
+ for (i = 0; !(__x & __m) && i != 8; i++) \
+ __m >>= 8; \
+} while (0)
+#endif
+
+/*
+ * __wt_vpack_posint --
+ * Packs a positive variable-length integer in the specified location.
+ */
+static inline int
+__wt_vpack_posint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+ uint8_t *p;
+ int len, lz, shift;
+
+ WT_LEADING_ZEROS(x, lz);
+ len = (int)sizeof (x) - lz;
+ WT_SIZE_CHECK(len + 1, maxlen);
+ p = *pp;
+
+ /* There are four bits we can use in the first byte. */
+ *p++ |= (len & 0xf);
+
+ for (shift = (len - 1) << 3; len != 0; --len, shift -= 8)
+ *p++ = (uint8_t)(x >> shift);
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vpack_negint --
+ * Packs a negative variable-length integer in the specified location.
+ */
+static inline int
+__wt_vpack_negint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+ uint8_t *p;
+ int len, lz, shift;
+
+ WT_LEADING_ZEROS(~x, lz);
+ len = (int)sizeof (x) - lz;
+ WT_SIZE_CHECK(len + 1, maxlen);
+ p = *pp;
+
+ /*
+ * There are four size bits we can use in the first byte.
+ * For negative numbers, we store the number of leading 0xff bytes
+ * to maintain ordering (if this is not obvious, it may help to
+ * remember that -1 is the largest negative number).
+ */
+ *p++ |= (lz & 0xf);
+
+ for (shift = (len - 1) << 3; len != 0; shift -= 8, --len)
+ *p++ = (uint8_t)(x >> shift);
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_posint --
+ * Reads a variable-length positive integer from the specified location.
+ */
+static inline int
+__wt_vunpack_posint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
+{
+ uint64_t x;
+ const uint8_t *p;
+ uint8_t len;
+
+ /* There are four length bits in the first byte. */
+ p = *pp;
+ len = (*p++ & 0xf);
+ WT_SIZE_CHECK(len + 1, maxlen);
+
+ for (x = 0; len != 0; --len)
+ x = (x << 8) | *p++;
+
+ *retp = x;
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_negint --
+ * Reads a variable-length negative integer from the specified location.
+ */
+static inline int
+__wt_vunpack_negint(const uint8_t **pp, size_t maxlen, uint64_t *retp)
+{
+ uint64_t x;
+ const uint8_t *p;
+ uint8_t len;
+
+ /* There are four length bits in the first byte. */
+ p = *pp;
+ len = (int)sizeof (x) - (*p++ & 0xf);
+ WT_SIZE_CHECK(len + 1, maxlen);
+
+ for (x = UINT64_MAX; len != 0; --len)
+ x = (x << 8) | *p++;
+
+ *retp = x;
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vpack_uint --
+ * Variable-sized packing for unsigned integers
+ */
+static inline int
+__wt_vpack_uint(uint8_t **pp, size_t maxlen, uint64_t x)
+{
+ uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ if (x <= POS_1BYTE_MAX)
+ *p++ = POS_1BYTE_MARKER | GET_BITS(x, 6, 0);
+ else if (x <= POS_2BYTE_MAX) {
+ WT_SIZE_CHECK(2, maxlen);
+ x -= POS_1BYTE_MAX + 1;
+ *p++ = POS_2BYTE_MARKER | GET_BITS(x, 13, 8);
+ *p++ = GET_BITS(x, 8, 0);
+ } else {
+ x -= POS_2BYTE_MAX + 1;
+ *p = POS_MULTI_MARKER;
+ return (__wt_vpack_posint(pp, maxlen, x));
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vpack_int --
+ * Variable-sized packing for signed integers
+ */
+static inline int
+__wt_vpack_int(uint8_t **pp, size_t maxlen, int64_t x)
+{
+ uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ if (x < NEG_2BYTE_MIN) {
+ *p = NEG_MULTI_MARKER;
+ return (__wt_vpack_negint(pp, maxlen, (uint64_t)x));
+ } else if (x < NEG_1BYTE_MIN) {
+ WT_SIZE_CHECK(2, maxlen);
+ x -= NEG_2BYTE_MIN;
+ *p++ = NEG_2BYTE_MARKER | GET_BITS(x, 13, 8);
+ *p++ = GET_BITS(x, 8, 0);
+ } else if (x < 0) {
+ x -= NEG_1BYTE_MIN;
+ *p++ = NEG_1BYTE_MARKER | GET_BITS(x, 6, 0);
+ } else
+ /* For non-negative values, use the unsigned code above. */
+ return (__wt_vpack_uint(pp, maxlen, (uint64_t)x));
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_uint --
+ * Variable-sized unpacking for unsigned integers
+ */
+static inline int
+__wt_vunpack_uint(const uint8_t **pp, size_t maxlen, uint64_t *xp)
+{
+ const uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ switch (*p & 0xf0) {
+ case POS_1BYTE_MARKER:
+ case POS_1BYTE_MARKER | 0x10:
+ case POS_1BYTE_MARKER | 0x20:
+ case POS_1BYTE_MARKER | 0x30:
+ *xp = GET_BITS(*p, 6, 0);
+ p += 1;
+ break;
+ case POS_2BYTE_MARKER:
+ case POS_2BYTE_MARKER | 0x10:
+ WT_SIZE_CHECK(2, maxlen);
+ *xp = GET_BITS(*p++, 5, 0) << 8;
+ *xp |= *p++;
+ *xp += POS_1BYTE_MAX + 1;
+ break;
+ case POS_MULTI_MARKER:
+ WT_RET(__wt_vunpack_posint(pp, maxlen, xp));
+ *xp += POS_2BYTE_MAX + 1;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vunpack_int --
+ * Variable-sized packing for signed integers
+ */
+static inline int
+__wt_vunpack_int(const uint8_t **pp, size_t maxlen, int64_t *xp)
+{
+ const uint8_t *p;
+
+ WT_SIZE_CHECK(1, maxlen);
+ p = *pp;
+ switch (*p & 0xf0) {
+ case NEG_MULTI_MARKER:
+ WT_RET(__wt_vunpack_negint(pp, maxlen, (uint64_t *)xp));
+ return (0);
+ case NEG_2BYTE_MARKER:
+ case NEG_2BYTE_MARKER | 0x10:
+ WT_SIZE_CHECK(2, maxlen);
+ *xp = (int64_t)(GET_BITS(*p++, 5, 0) << 8);
+ *xp |= *p++;
+ *xp += NEG_2BYTE_MIN;
+ break;
+ case NEG_1BYTE_MARKER:
+ case NEG_1BYTE_MARKER | 0x10:
+ case NEG_1BYTE_MARKER | 0x20:
+ case NEG_1BYTE_MARKER | 0x30:
+ *xp = NEG_1BYTE_MIN + (int64_t)GET_BITS(*p, 6, 0);
+ p += 1;
+ break;
+ default:
+ /* Identical to the unsigned case. */
+ return (__wt_vunpack_uint(pp, maxlen, (uint64_t *)xp));
+ }
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __wt_vsize_posint --
+ * Return the packed size of a positive variable-length integer.
+ */
+static inline size_t
+__wt_vsize_posint(uint64_t x)
+{
+ int lz;
+
+ WT_LEADING_ZEROS(x, lz);
+ return ((size_t)(WT_INTPACK64_MAXSIZE - lz));
+}
+
+/*
+ * __wt_vsize_negint --
+ * Return the packed size of a negative variable-length integer.
+ */
+static inline size_t
+__wt_vsize_negint(uint64_t x)
+{
+ int lz;
+
+ WT_LEADING_ZEROS(~x, lz);
+ return ((size_t)(WT_INTPACK64_MAXSIZE - lz));
+}
+
+/*
+ * __wt_vsize_uint --
+ * Return the packed size of an unsigned integer.
+ */
+static inline size_t
+__wt_vsize_uint(uint64_t x)
+{
+ if (x <= POS_1BYTE_MAX)
+ return (1);
+ if (x <= POS_2BYTE_MAX)
+ return (2);
+ x -= POS_2BYTE_MAX + 1;
+ return (__wt_vsize_posint(x));
+}
+
+/*
+ * __wt_vsize_int --
+ * Return the packed size of a signed integer.
+ */
+static inline size_t
+__wt_vsize_int(int64_t x)
+{
+ if (x < NEG_2BYTE_MIN)
+ return (__wt_vsize_negint((uint64_t)x));
+ if (x < NEG_1BYTE_MIN)
+ return (2);
+ if (x < 0)
+ return (1);
+ /* For non-negative values, use the unsigned code above. */
+ return (__wt_vsize_uint((uint64_t)x));
+}
diff --git a/src/third_party/wiredtiger/src/include/lint.h b/src/third_party/wiredtiger/src/include/lint.h
new file mode 100644
index 00000000000..7c0a103a8ee
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/lint.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_GCC_ATTRIBUTE(x)
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+#define __WT_ATOMIC_ADD(v, val) \
+ ((v) += (val))
+#define __WT_ATOMIC_FETCH_ADD(v, val) \
+ ((v) += (val), (v))
+#define __WT_ATOMIC_CAS(v, old, new) \
+ ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
+#define __WT_ATOMIC_CAS_VAL(v, old, new) \
+ ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
+#define __WT_ATOMIC_STORE(v, val) \
+ ((v) = (val))
+#define __WT_ATOMIC_SUB(v, val) \
+ ((v) -= (val), (v))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val)
+#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
+#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) __WT_ATOMIC_CAS_VAL(v, old, new)
+#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val)
+
+static inline void WT_BARRIER(void) { return; }
+static inline void WT_FULL_BARRIER(void) { return; }
+static inline void WT_PAUSE(void) { return; }
+static inline void WT_READ_BARRIER(void) { return; }
+static inline void WT_WRITE_BARRIER(void) { return; }
diff --git a/src/third_party/wiredtiger/src/include/log.h b/src/third_party/wiredtiger/src/include/log.h
new file mode 100644
index 00000000000..15054e34906
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/log.h
@@ -0,0 +1,177 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_LOG_FILENAME "WiredTigerLog" /* Log file name */
+
+/* Logging subsystem declarations. */
+#define LOG_ALIGN 128
+#define WT_LOG_SLOT_BUF_INIT_SIZE (64 * 1024)
+
+#define INIT_LSN(l) do { \
+ (l)->file = 1; \
+ (l)->offset = 0; \
+} while (0)
+
+#define IS_INIT_LSN(l) ((l)->file == 1 && (l)->offset == 0)
+
+/*
+ * Both of the macros below need to change if the content of __wt_lsn
+ * ever changes. The value is the following:
+ * txnid, record type, operation type, file id, operation key, operation value
+ */
+#define LOGC_KEY_FORMAT WT_UNCHECKED_STRING(IqI)
+#define LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu)
+
+#define LOG_SKIP_HEADER(data) \
+ ((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record))
+#define LOG_REC_SIZE(size) \
+ ((size) - offsetof(WT_LOG_RECORD, record))
+
+#define MAX_LSN(l) do { \
+ (l)->file = UINT32_MAX; \
+ (l)->offset = INT64_MAX; \
+} while (0)
+
+/*
+ * Compare two LSNs, return -1 if lsn1 < lsn2, 0 if lsn1 == lsn2
+ * and 1 if lsn1 > lsn2.
+ */
+#define LOG_CMP(lsn1, lsn2) \
+ ((lsn1)->file != (lsn2)->file ? \
+ ((lsn1)->file < (lsn2)->file ? -1 : 1) : \
+ ((lsn1)->offset != (lsn2)->offset ? \
+ ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0))
+
+/*
+ * Possible values for the consolidation array slot states:
+ * < WT_LOG_SLOT_DONE - threads are actively writing to the log.
+ * WT_LOG_SLOT_DONE - all activity on this slot is complete.
+ * WT_LOG_SLOT_FREE - slot is available for allocation.
+ * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active.
+ * WT_LOG_SLOT_READY - slot is ready for threads to join.
+ * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot.
+ */
+#define WT_LOG_SLOT_DONE 0
+#define WT_LOG_SLOT_FREE 1
+#define WT_LOG_SLOT_PENDING 2
+#define WT_LOG_SLOT_READY 3
+typedef struct {
+ int64_t slot_state; /* Slot state */
+ uint64_t slot_group_size; /* Group size */
+ int32_t slot_error; /* Error value */
+#define SLOT_INVALID_INDEX 0xffffffff
+ uint32_t slot_index; /* Active slot index */
+ wt_off_t slot_start_offset; /* Starting file offset */
+ WT_LSN slot_release_lsn; /* Slot release LSN */
+ WT_LSN slot_start_lsn; /* Slot starting LSN */
+ WT_LSN slot_end_lsn; /* Slot ending LSN */
+ WT_FH *slot_fh; /* File handle for this group */
+ WT_ITEM slot_buf; /* Buffer for grouped writes */
+ int32_t slot_churn; /* Active slots are scarce. */
+
+#define SLOT_BUF_GROW 0x01 /* Grow buffer on release */
+#define SLOT_BUFFERED 0x02 /* Buffer writes */
+#define SLOT_CLOSEFH 0x04 /* Close old fh on release */
+#define SLOT_SYNC 0x08 /* Needs sync on release */
+ uint32_t flags; /* Flags */
+} WT_LOGSLOT WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
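+/*
+ * Editor's note -- a minimal sketch of reading the slot-state encoding
+ * described above (the function name is illustrative): a slot is in the
+ * ready/joining range while its state is WT_LOG_SLOT_READY or greater,
+ * since values above READY count the threads consolidating on the slot.
+ */
+static inline int
+ex_log_slot_joinable(WT_LOGSLOT *slot)
+{
+ return (slot->slot_state >= WT_LOG_SLOT_READY);
+}
+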
+
+typedef struct {
+ WT_LOGSLOT *slot;
+ wt_off_t offset;
+} WT_MYSLOT;
+
+ /* Offset of first record */
+#define LOG_FIRST_RECORD log->allocsize
+
+typedef struct {
+ uint32_t allocsize; /* Allocation alignment size */
+ wt_off_t log_written; /* Amount of log written this period */
+ /*
+ * Log file information
+ */
+ uint32_t fileid; /* Current log file number */
+ WT_FH *log_fh; /* Logging file handle */
+ WT_FH *log_close_fh; /* Logging file handle to close */
+
+ /*
+ * System LSNs
+ */
+ WT_LSN alloc_lsn; /* Next LSN for allocation */
+ WT_LSN ckpt_lsn; /* Last checkpoint LSN */
+ WT_LSN first_lsn; /* First LSN */
+ WT_LSN sync_lsn; /* LSN of the last sync */
+ WT_LSN trunc_lsn; /* End LSN for recovery truncation */
+ WT_LSN write_lsn; /* Last LSN written to log file */
+
+ /*
+ * Synchronization resources
+ */
+ WT_SPINLOCK log_lock; /* Locked: Logging fields */
+ WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */
+ WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */
+
+ WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
+
+ /* Notify any waiting threads when sync_lsn is updated. */
+ WT_CONDVAR *log_sync_cond;
+
+ /*
+ * Consolidation array information
+ * SLOT_ACTIVE must be less than SLOT_POOL.
+ * Our testing shows that the more consolidation we generate,
+ * the better the performance we see, which equates to an
+ * active slot count of one.
+ */
+#define SLOT_ACTIVE 1
+#define SLOT_POOL 16
+ uint32_t pool_index; /* Global pool index */
+ WT_LOGSLOT *slot_array[SLOT_ACTIVE]; /* Active slots */
+ WT_LOGSLOT slot_pool[SLOT_POOL]; /* Pool of all slots */
+
+#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */
+ uint32_t flags;
+} WT_LOG;
+
+typedef struct {
+ uint32_t len; /* 00-03: Record length including hdr */
+ uint32_t checksum; /* 04-07: Checksum of the record */
+ uint8_t unused[8]; /* 08-15: Padding */
+ uint8_t record[0]; /* Beginning of actual data */
+} WT_LOG_RECORD;
+
+/*
+ * WT_LOG_DESC --
+ * The log file's description.
+ */
+struct __wt_log_desc {
+#define WT_LOG_MAGIC 0x101064
+ uint32_t log_magic; /* 00-03: Magic number */
+#define WT_LOG_MAJOR_VERSION 1
+ uint16_t majorv; /* 04-05: Major version */
+#define WT_LOG_MINOR_VERSION 0
+ uint16_t minorv; /* 06-07: Minor version */
+ uint64_t log_size; /* 08-15: Log file size */
+};
+
+/*
+ * WT_LOG_REC_DESC --
+ * A descriptor for a log record type.
+ */
+struct __wt_log_rec_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
+
+/*
+ * WT_LOG_OP_DESC --
+ * A descriptor for a log operation type.
+ */
+struct __wt_log_op_desc {
+ const char *fmt;
+ int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
+};
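Editor's note: a small sketch of how the LSN macros above compose, assuming
WT_LSN and this header are in scope; the function name is illustrative. An
initialized LSN orders at or before anything a running log produces, and
MAX_LSN is a sentinel that orders after everything:

static int
ex_lsn_in_range(WT_LSN *current)
{
        WT_LSN init_lsn, max_lsn;

        INIT_LSN(&init_lsn);            /* file 1, offset 0 */
        MAX_LSN(&max_lsn);              /* UINT32_MAX / INT64_MAX sentinel */

        return (LOG_CMP(&init_lsn, current) <= 0 &&
            LOG_CMP(current, &max_lsn) < 0);
}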
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
new file mode 100644
index 00000000000..99532b97850
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_LSM_WORKER_COOKIE --
+ * State for an LSM worker thread.
+ */
+struct __wt_lsm_worker_cookie {
+ WT_LSM_CHUNK **chunk_array;
+ size_t chunk_alloc;
+ u_int nchunks;
+};
+
+/*
+ * WT_LSM_WORKER_ARGS --
+ * State for an LSM worker thread.
+ */
+struct __wt_lsm_worker_args {
+ WT_SESSION_IMPL *session; /* Session */
+ WT_CONDVAR *work_cond; /* Owned by the manager */
+ wt_thread_t tid; /* Thread id */
+ u_int id; /* My manager slot id */
+ uint32_t type; /* Types of operations handled */
+#define WT_LSM_WORKER_RUN 0x01
+ uint32_t flags; /* Worker flags */
+};
+
+/*
+ * WT_CURSOR_LSM --
+ * An LSM cursor.
+ */
+struct __wt_cursor_lsm {
+ WT_CURSOR iface;
+
+ WT_LSM_TREE *lsm_tree;
+ uint64_t dsk_gen;
+
+ u_int nchunks; /* Number of chunks in the cursor */
+ u_int nupdates; /* Updates needed (including
+ snapshot isolation checks). */
+ WT_BLOOM **blooms; /* Bloom filter handles. */
+ size_t bloom_alloc;
+
+ WT_CURSOR **cursors; /* Cursor handles. */
+ size_t cursor_alloc;
+
+ WT_CURSOR *current; /* The current cursor for iteration */
+ WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */
+
+ uint64_t *switch_txn; /* Switch txn for each chunk */
+ size_t txnid_alloc;
+
+ u_int update_count; /* Updates performed. */
+
+#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */
+#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */
+#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */
+#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */
+#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */
+#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the
+ current key */
+#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */
+#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */
+ uint32_t flags;
+};
+
+/*
+ * WT_LSM_CHUNK --
+ * A single chunk (file) in an LSM tree.
+ */
+struct __wt_lsm_chunk {
+ const char *uri; /* Data source for this chunk */
+ const char *bloom_uri; /* URI of Bloom filter, if any */
+ struct timespec create_ts; /* Creation time (for rate limiting) */
+ uint64_t count; /* Approximate count of records */
+ uint64_t size; /* Final chunk size */
+
+ uint64_t switch_txn; /*
+ * Largest transaction that can write
+ * to this chunk, set by a worker
+ * thread when the chunk is switched
+ * out, or by compact to get the most
+ * recent chunk flushed.
+ */
+
+ uint32_t id; /* ID used to generate URIs */
+ uint32_t generation; /* Merge generation */
+ uint32_t refcnt; /* Number of worker thread references */
+	uint32_t bloom_busy;		/* Number of threads using the
+					   chunk's Bloom filter */
+
+ int8_t empty; /* 1/0: checkpoint missing */
+ int8_t evicted; /* 1/0: in-memory chunk was evicted */
+
+#define WT_LSM_CHUNK_BLOOM 0x01
+#define WT_LSM_CHUNK_MERGING 0x02
+#define WT_LSM_CHUNK_ONDISK 0x04
+#define WT_LSM_CHUNK_STABLE 0x08
+ uint32_t flags;
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+/*
+ * Different types of work units. Used by LSM worker threads to choose which
+ * type of work they will execute, and by work units to define which action
+ * is required.
+ */
+#define WT_LSM_WORK_BLOOM 0x01 /* Create a bloom filter */
+#define WT_LSM_WORK_DROP 0x02 /* Drop unused chunks */
+#define WT_LSM_WORK_FLUSH 0x04 /* Flush a chunk to disk */
+#define WT_LSM_WORK_MERGE 0x08 /* Look for a tree merge */
+#define WT_LSM_WORK_SWITCH 0x10 /* Switch to new in-memory chunk */
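
Since WT_LSM_WORKER_ARGS carries a `type` mask built from these bits, dispatch reduces to a mask test; a minimal sketch, using the FLD_ISSET flag helper defined in misc.h (the function name is hypothetical):

/* Sketch: does a worker's type mask cover a work unit's operation type? */
static int
worker_handles(const WT_LSM_WORKER_ARGS *worker, uint32_t unit_type)
{
	return (FLD_ISSET(worker->type, unit_type) ? 1 : 0);
}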
+
+/*
+ * WT_LSM_WORK_UNIT --
+ * A definition of maintenance that an LSM tree needs done.
+ */
+struct __wt_lsm_work_unit {
+ TAILQ_ENTRY(__wt_lsm_work_unit) q; /* Worker unit queue */
+ uint32_t type; /* Type of operation */
+#define WT_LSM_WORK_FORCE 0x0001 /* Force operation */
+ uint32_t flags; /* Flags for operation */
+ WT_LSM_TREE *lsm_tree;
+};
+
+/*
+ * WT_LSM_MANAGER --
+ * A structure that holds resources used to manage any LSM trees in a
+ * database.
+ */
+struct __wt_lsm_manager {
+ /*
+	 * Queues of work units for LSM worker threads. We maintain three
+	 * queues so that each queue stays FIFO, rather than managing the
+	 * order of work by shuffling entries within a single queue:
+	 * one queue for switches, since a switch should never wait for
+	 * other work to be done;
+	 * one queue for application-requested work, for example flushing
+	 * and creating Bloom filters;
+	 * one queue for longer-running operations such as merges.
+ */
+ TAILQ_HEAD(__wt_lsm_work_switch_qh, __wt_lsm_work_unit) switchqh;
+ TAILQ_HEAD(__wt_lsm_work_app_qh, __wt_lsm_work_unit) appqh;
+ TAILQ_HEAD(__wt_lsm_work_manager_qh, __wt_lsm_work_unit) managerqh;
+ WT_SPINLOCK switch_lock; /* Lock for switch queue */
+ WT_SPINLOCK app_lock; /* Lock for application queue */
+ WT_SPINLOCK manager_lock; /* Lock for manager queue */
+ WT_CONDVAR *work_cond; /* Used to notify worker of activity */
+ uint32_t lsm_workers; /* Current number of LSM workers */
+ uint32_t lsm_workers_max;
+#define WT_LSM_MAX_WORKERS 20
+ WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS];
+};
+
+/*
+ * WT_LSM_TREE --
+ * An LSM tree.
+ */
+struct __wt_lsm_tree {
+ const char *name, *config, *filename;
+ const char *key_format, *value_format;
+ const char *bloom_config, *file_config;
+
+ WT_COLLATOR *collator;
+ const char *collator_name;
+
+ int refcnt; /* Number of users of the tree */
+#define LSM_TREE_MAX_QUEUE 100
+ int queue_ref;
+ WT_RWLOCK *rwlock;
+ TAILQ_ENTRY(__wt_lsm_tree) q;
+
+ WT_DSRC_STATS stats; /* LSM-level statistics */
+
+ uint64_t dsk_gen;
+
+ long ckpt_throttle; /* Rate limiting due to checkpoints */
+ long merge_throttle; /* Rate limiting due to merges */
+ uint64_t chunk_fill_ms; /* Estimate of time to fill a chunk */
+ struct timespec last_flush_ts; /* Timestamp last flush finished */
+ struct timespec work_push_ts; /* Timestamp last work unit added */
+ uint64_t merge_progressing; /* Bumped when merges are active */
+ uint32_t merge_syncing; /* Bumped when merges are syncing */
+
+ /* Configuration parameters */
+ uint32_t bloom_bit_count;
+ uint32_t bloom_hash_count;
+ uint64_t chunk_size;
+ uint64_t chunk_max;
+ u_int merge_min, merge_max;
+
+ u_int merge_idle; /* Count of idle merge threads */
+
+#define WT_LSM_BLOOM_MERGED 0x00000001
+#define WT_LSM_BLOOM_OFF 0x00000002
+#define WT_LSM_BLOOM_OLDEST 0x00000004
+ uint32_t bloom; /* Bloom creation policy */
+
+ WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */
+ size_t chunk_alloc; /* Space allocated for chunks */
+ u_int nchunks; /* Number of active chunks */
+ uint32_t last; /* Last allocated ID */
+ int modified; /* Have there been updates? */
+
+ WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */
+ size_t old_alloc; /* Space allocated for old chunks */
+ u_int nold_chunks; /* Number of old chunks */
+ int freeing_old_chunks; /* Whether chunks are being freed */
+ uint32_t merge_aggressiveness; /* Increase amount of work per merge */
+
+#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */
+#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */
+#define WT_LSM_TREE_NEED_SWITCH 0x04 /* New chunk needs creating */
+#define WT_LSM_TREE_OPEN 0x08 /* The tree is open */
+#define WT_LSM_TREE_THROTTLE 0x10 /* Throttle updates */
+ uint32_t flags;
+
+#define WT_LSM_TREE_EXCLUSIVE 0x01 /* Tree is opened exclusively */
+ uint8_t flags_atomic;
+};
+
+/*
+ * WT_LSM_DATA_SOURCE --
+ * Implementation of the WT_DATA_SOURCE interface for LSM.
+ */
+struct __wt_lsm_data_source {
+ WT_DATA_SOURCE iface;
+
+ WT_RWLOCK *rwlock;
+};
diff --git a/src/third_party/wiredtiger/src/include/meta.h b/src/third_party/wiredtiger/src/include/meta.h
new file mode 100644
index 00000000000..e4d7fd64f94
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/meta.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_WIREDTIGER "WiredTiger" /* Version file */
+#define WT_SINGLETHREAD "WiredTiger.lock" /* Locking file */
+
+#define WT_BASECONFIG "WiredTiger.basecfg" /* Configuration */
+#define WT_USERCONFIG "WiredTiger.config" /* Configuration */
+
+#define WT_METADATA_BACKUP "WiredTiger.backup" /* Hot backup file */
+
+#define WT_METADATA_TURTLE "WiredTiger.turtle" /* Metadata metadata */
+#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */
+
+#define WT_METADATA_URI "metadata:" /* Metadata alias */
+#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */
+#define WT_IS_METADATA(dh) \
+ (strcmp((dh)->name, WT_METAFILE_URI) == 0)
+#define WT_METAFILE_ID 0 /* Metadata file ID */
+
+#define WT_METADATA_VERSION "WiredTiger version" /* Version keys */
+#define WT_METADATA_VERSION_STR "WiredTiger version string"
+
+/*
+ * WT_CKPT --
+ * Encapsulation of checkpoint information, shared by the metadata, the
+ * btree engine, and the block manager.
+ */
+#define WT_CHECKPOINT "WiredTigerCheckpoint"
+#define WT_CKPT_FOREACH(ckptbase, ckpt) \
+ for ((ckpt) = (ckptbase); (ckpt)->name != NULL; ++(ckpt))
+
+struct __wt_ckpt {
+ char *name; /* Name or NULL */
+
+ WT_ITEM addr; /* Checkpoint cookie string */
+ WT_ITEM raw; /* Checkpoint cookie raw */
+
+ int64_t order; /* Checkpoint order */
+
+ uintmax_t sec; /* Timestamp */
+
+ uint64_t ckpt_size; /* Checkpoint size */
+
+ uint64_t write_gen; /* Write generation */
+
+ void *bpriv; /* Block manager private */
+
+#define WT_CKPT_ADD 0x01 /* Checkpoint to be added */
+#define WT_CKPT_DELETE 0x02 /* Checkpoint to be deleted */
+#define WT_CKPT_FAKE 0x04 /* Checkpoint is a fake */
+#define WT_CKPT_UPDATE 0x08 /* Checkpoint requires update */
+ uint32_t flags;
+};
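
The WT_CKPT_FOREACH macro walks the name-terminated array; for example, counting the entries scheduled for deletion (the function name is illustrative):

/* Sketch: count checkpoints flagged for deletion. */
static u_int
ckpt_count_deleted(WT_CKPT *ckptbase)
{
	WT_CKPT *ckpt;
	u_int deleted = 0;

	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_DELETE))
			++deleted;
	return (deleted);
}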
diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h
new file mode 100644
index 00000000000..bf2c4ccb8cf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/misc.h
@@ -0,0 +1,221 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Quiet compiler warnings about unused function parameters and variables,
+ * and unused function return values.
+ */
+#define WT_UNUSED(var) (void)(var)
+
+/* Basic constants. */
+#define WT_MILLION (1000000)
+#define WT_BILLION (1000000000)
+
+#define WT_KILOBYTE (1024)
+#define WT_MEGABYTE (1048576)
+#define WT_GIGABYTE (1073741824)
+#define WT_TERABYTE ((uint64_t)1099511627776)
+#define WT_PETABYTE ((uint64_t)1125899906842624)
+
+/*
+ * Initial number of directory-list entries; the list can grow dynamically.
+ */
+#define WT_DIR_ENTRY 32
+
+#define WT_DIRLIST_EXCLUDE 0x1 /* Exclude files matching prefix */
+#define WT_DIRLIST_INCLUDE 0x2 /* Include files matching prefix */
+
+/*
+ * Sizes that cannot be larger than 2**32 are stored in uint32_t fields in
+ * common structures to save space. To minimize conversions from size_t to
+ * uint32_t through the code, we use the following macros.
+ */
+#define WT_STORE_SIZE(s) ((uint32_t)(s))
+#define WT_PTRDIFF(end, begin) \
+ ((size_t)((uint8_t *)(end) - (uint8_t *)(begin)))
+#define WT_PTRDIFF32(end, begin) \
+ WT_STORE_SIZE(WT_PTRDIFF((end), (begin)))
+#define WT_BLOCK_FITS(p, len, begin, maxlen) \
+ ((uint8_t *)(p) >= (uint8_t *)(begin) && \
+ ((uint8_t *)(p) + (len) <= (uint8_t *)(begin) + (maxlen)))
+#define WT_PTR_IN_RANGE(p, begin, maxlen) \
+ WT_BLOCK_FITS((p), 1, (begin), (maxlen))
+
+/*
+ * Align an unsigned value of any type to a specified power-of-2, including the
+ * offset result of a pointer subtraction; do the calculation using the largest
+ * unsigned integer type available.
+ */
+#define WT_ALIGN(n, v) \
+ ((((uintmax_t)(n)) + ((v) - 1)) & ~(((uintmax_t)(v)) - 1))
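
For example, WT_ALIGN(1000, 512) computes (1000 + 511) & ~511 = 1024, while WT_ALIGN(1024, 512) is unchanged at 1024; the mask trick is only correct when v is a power of two, as the comment requires.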
+
+/* Min, max. */
+#define WT_MIN(a, b) ((a) < (b) ? (a) : (b))
+#define WT_MAX(a, b) ((a) < (b) ? (b) : (a))
+
+/* Elements in an array. */
+#define WT_ELEMENTS(a) (sizeof(a) / sizeof(a[0]))
+
+/* Skip lists: up to 10 levels; an element gains a level with 1/4 chance. */
+#define WT_SKIP_MAXDEPTH 10
+#define WT_SKIP_PROBABILITY (UINT32_MAX >> 2)
+
+/*
+ * __wt_calloc_def --
+ * Simple calls don't need separate sizeof arguments.
+ */
+#define __wt_calloc_def(session, number, addr) \
+ __wt_calloc(session, (size_t)(number), sizeof(**(addr)), addr)
+
+/*
+ * __wt_realloc_def --
+ * Common case allocate-and-grow function.
+ * Starts by allocating the requested number of items (at least 10), then
+ * doubles each time the list needs to grow.
+ */
+#define __wt_realloc_def(session, sizep, number, addr) \
+ (((number) * sizeof(**(addr)) <= *(sizep)) ? 0 : \
+ __wt_realloc(session, sizep, WT_MAX(*(sizep) * 2, \
+ WT_MAX(10, (number)) * sizeof(**(addr))), addr))
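
A usage sketch of the allocate-and-grow pattern with hypothetical names; WT_RET is WiredTiger's return-on-error macro:

/* Sketch: append a cursor to a dynamically grown array. */
static int
cursor_list_append(WT_SESSION_IMPL *session,
    WT_CURSOR ***listp, size_t *allocp, u_int *countp, WT_CURSOR *c)
{
	/* Grows to at least 10 entries, then doubles as needed. */
	WT_RET(__wt_realloc_def(session, allocp, *countp + 1, listp));
	(*listp)[(*countp)++] = c;
	return (0);
}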
+/*
+ * Our internal free function clears the underlying address atomically so there
+ * is a smaller chance of racing threads seeing intermediate results while a
+ * structure is being free'd. (That would be a bug, of course, but I'd rather
+ * not drop core, just the same.) That's a non-standard "free" API, and the
+ * resulting bug is a mother to find -- make sure we get it right, don't make
+ * the caller remember to put the & operator on the pointer.
+ */
+#define __wt_free(session, p) do { \
+ if ((p) != NULL) \
+ __wt_free_int(session, (void *)&(p)); \
+} while (0)
+#ifdef HAVE_DIAGNOSTIC
+#define __wt_overwrite_and_free(session, p) do { \
+ memset(p, WT_DEBUG_BYTE, sizeof(*(p))); \
+ __wt_free(session, p); \
+} while (0)
+#define __wt_overwrite_and_free_len(session, p, len) do { \
+ memset(p, WT_DEBUG_BYTE, len); \
+ __wt_free(session, p); \
+} while (0)
+#else
+#define __wt_overwrite_and_free(session, p) __wt_free(session, p)
+#define __wt_overwrite_and_free_len(session, p, len) __wt_free(session, p)
+#endif
+
+/*
+ * Flag set, clear and test.
+ *
+ * They come in 3 flavors: F_XXX (handles a field named "flags" in the structure
+ * referenced by its argument), LF_XXX (handles a local variable named "flags"),
+ * and FLD_XXX (handles any variable, anywhere).
+ *
+ * Flags are unsigned 32-bit values -- we cast to keep the compiler quiet (the
+ * hex constant might be a negative integer), and to ensure the hex constant is
+ * the correct size before applying the bitwise not operator.
+ */
+#define F_CLR(p, mask) ((p)->flags &= ~((uint32_t)(mask)))
+#define F_ISSET(p, mask) ((p)->flags & ((uint32_t)(mask)))
+#define F_SET(p, mask) ((p)->flags |= ((uint32_t)(mask)))
+
+#define LF_CLR(mask) ((flags) &= ~((uint32_t)(mask)))
+#define LF_ISSET(mask) ((flags) & ((uint32_t)(mask)))
+#define LF_SET(mask) ((flags) |= ((uint32_t)(mask)))
+
+#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask)))
+#define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask)))
+#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask)))
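
A sketch showing the three flavors side by side; the WT_LSM_TREE argument, the field pointer, and the 0x01/0x02 masks are placeholders:

/* Sketch: F_XXX on a struct, LF_XXX on a local, FLD_XXX on any lvalue. */
static void
flag_flavors(WT_LSM_TREE *lsm_tree, uint32_t *fieldp)
{
	uint32_t flags = 0;

	F_SET(lsm_tree, WT_LSM_TREE_ACTIVE);	/* Struct member "flags" */
	if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
		F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);

	LF_SET(0x01);				/* Local variable "flags" */
	(void)flags;

	FLD_SET(*fieldp, 0x02);			/* Arbitrary lvalue */
}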
+
+/* Verbose messages. */
+#ifdef HAVE_VERBOSE
+#define WT_VERBOSE_ISSET(session, f) \
+ (FLD_ISSET(S2C(session)->verbose, f))
+#else
+#define WT_VERBOSE_ISSET(session, f) 0
+#endif
+
+/*
+ * Clear a structure, two flavors: inline when we want to guarantee there's
+ * no function call or setup/tear-down of a loop, and the default where the
+ * compiler presumably chooses. Gcc 4.3 is supposed to get this right, but
+ * we've seen problems when calling memset to clear structures in performance
+ * critical paths.
+ */
+#define WT_CLEAR_INLINE(type, s) do { \
+ static const type __clear; \
+ s = __clear; \
+} while (0)
+#define WT_CLEAR(s) \
+ memset(&(s), 0, sizeof(s))
+
+/* Check if a string matches a prefix. */
+#define WT_PREFIX_MATCH(str, pfx) \
+ (((const char *)str)[0] == ((const char *)pfx)[0] && \
+ strncmp((str), (pfx), strlen(pfx)) == 0)
+
+/* Check if a non-nul-terminated string matches a prefix. */
+#define WT_PREFIX_MATCH_LEN(str, len, pfx) \
+ ((len) >= strlen(pfx) && WT_PREFIX_MATCH(str, pfx))
+
+/* Check if a string matches a prefix, and move past it. */
+#define WT_PREFIX_SKIP(str, pfx) \
+ (WT_PREFIX_MATCH(str, pfx) ? ((str) += strlen(pfx), 1) : 0)
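
For example, stripping a URI scheme before using the object name (the URI value is hypothetical); note that WT_PREFIX_SKIP both tests the prefix and advances the pointer:

#include <stdio.h>

/* Sketch: test a prefix and step past it in one expression. */
static void
prefix_example(void)
{
	const char *uri = "file:mytable.wt";

	if (WT_PREFIX_SKIP(uri, "file:"))
		printf("object name: %s\n", uri);	/* "mytable.wt" */
}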
+
+/*
+ * Check if a variable string equals a constant string. Inline the common
+ * case for WiredTiger of a single byte string. This is required because not
+ * all compilers optimize this case in strcmp (e.g., clang).
+ */
+#define WT_STREQ(s, cs) \
+ (sizeof(cs) == 2 ? (s)[0] == (cs)[0] && (s)[1] == '\0' : \
+ strcmp(s, cs) == 0)
+
+/* Check if a string matches a byte string of len bytes. */
+#define WT_STRING_MATCH(str, bytes, len) \
+ (((const char *)str)[0] == ((const char *)bytes)[0] && \
+ strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0')
+
+/*
+ * Macro that produces a string literal that isn't wrapped in quotes, to avoid
+ * tripping up spell checkers.
+ */
+#define WT_UNCHECKED_STRING(str) #str
+
+/* Function return value and scratch buffer declaration and initialization. */
+#define WT_DECL_ITEM(i) WT_ITEM *i = NULL
+#define WT_DECL_RET int ret = 0
+
+/* If a WT_ITEM data field points somewhere in its allocated memory. */
+#define WT_DATA_IN_ITEM(i) \
+ ((i)->mem != NULL && (i)->data >= (i)->mem && \
+ WT_PTRDIFF((i)->data, (i)->mem) < (i)->memsize)
+
+/* Copy the data and size fields of an item. */
+#define WT_ITEM_SET(dst, src) do { \
+ (dst).data = (src).data; \
+ (dst).size = (src).size; \
+} while (0)
+
+/*
+ * In diagnostic mode we track the locations from which hazard pointers and
+ * scratch buffers were acquired.
+ */
+#ifdef HAVE_DIAGNOSTIC
+#define __wt_scr_alloc(session, size, scratchp) \
+ __wt_scr_alloc_func(session, size, scratchp, __FILE__, __LINE__)
+#define __wt_page_in(session, ref, flags) \
+ __wt_page_in_func(session, ref, flags, __FILE__, __LINE__)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags, __FILE__, __LINE__)
+#else
+#define __wt_scr_alloc(session, size, scratchp) \
+ __wt_scr_alloc_func(session, size, scratchp)
+#define __wt_page_in(session, ref, flags) \
+ __wt_page_in_func(session, ref, flags)
+#define __wt_page_swap(session, held, want, flags) \
+ __wt_page_swap_func(session, held, want, flags)
+#endif
diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i
new file mode 100644
index 00000000000..73caed09c8c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/misc.i
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_verbose --
+ * Verbose message.
+ */
+static inline int
+__wt_verbose(WT_SESSION_IMPL *session, int flag, const char *fmt, ...)
+    WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+#ifdef HAVE_VERBOSE
+ WT_DECL_RET;
+ va_list ap;
+
+ if (WT_VERBOSE_ISSET(session, flag)) {
+ va_start(ap, fmt);
+ ret = __wt_eventv(session, 1, 0, NULL, 0, fmt, ap);
+ va_end(ap);
+ }
+ return (ret);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(fmt);
+ WT_UNUSED(flag);
+ return (0);
+#endif
+}
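
A usage sketch, assuming one of WiredTiger's verbose categories such as WT_VERB_LSM and hypothetical chunk_id/nchunks locals; the message is only formatted when the category is enabled:

WT_RET(__wt_verbose(session, WT_VERB_LSM,
    "switched to chunk %u of %u", chunk_id, nchunks));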
diff --git a/src/third_party/wiredtiger/src/include/msvc.h b/src/third_party/wiredtiger/src/include/msvc.h
new file mode 100644
index 00000000000..8f44a329940
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/msvc.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+#include <intrin.h>
+
+#ifndef _M_AMD64
+#error "Only x64 is supported with MSVC"
+#endif
+
+#define inline __inline
+
+#define WT_GCC_ATTRIBUTE(x)
+#define WT_GCC_FUNC_ATTRIBUTE(x)
+
+#define __WT_ATOMIC_ADD(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)) + (val))
+#define __WT_ATOMIC_CAS(v, old, new, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedCompareExchange ## s \
+ ((t*)&(v), (t)(new), (t)(old)) == (t)(old))
+#define __WT_ATOMIC_CAS_VAL(v, old, new, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedCompareExchange ## s((t*)&(v), (t)(new), (t)(old)))
+#define __WT_ATOMIC_STORE(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchange ## s((t*)&(v), (t)(val)))
+#define __WT_ATOMIC_SUB(v, val, n, s, t) \
+ (WT_STATIC_ASSERT(sizeof(v) == (n)), \
+ _InterlockedExchangeAdd ## s((t*)&(v), -(t) val) - (val))
+
+#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1, 8, char)
+#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char)
+#define WT_ATOMIC_CAS_VAL1(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 1, 8, char)
+#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char)
+#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char)
+
+#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2, 16, short)
+#define WT_ATOMIC_CAS2(v, old, new) \
+ __WT_ATOMIC_CAS(v, old, new, 2, 16, short)
+#define WT_ATOMIC_CAS_VAL2(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 2, 16, short)
+#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short)
+#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short)
+
+#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long)
+#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long)
+#define WT_ATOMIC_CAS_VAL4(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 4, , long)
+#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4, , long)
+#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long)
+
+#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8, 64, __int64)
+#define WT_ATOMIC_CAS8(v, old, new) \
+ __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64)
+#define WT_ATOMIC_CAS_VAL8(v, old, new) \
+ __WT_ATOMIC_CAS_VAL(v, old, new, 8, 64, __int64)
+#define WT_ATOMIC_STORE8(v, val) \
+ __WT_ATOMIC_STORE(v, val, 8, 64, __int64)
+#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64)
+
+static inline void WT_BARRIER(void) { _ReadWriteBarrier(); }
+static inline void WT_FULL_BARRIER(void) { _mm_mfence(); }
+static inline void WT_PAUSE(void) { _mm_pause(); }
+static inline void WT_READ_BARRIER(void) { _mm_lfence(); }
+static inline void WT_WRITE_BARRIER(void) { _mm_sfence(); }
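
A sketch of the 4-byte flavors on a shared counter; `long` is 4 bytes under MSVC x64, so the embedded static assertion passes (the counter and function names are hypothetical):

/* Sketch: atomic enter/leave/claim on a shared counter. */
static volatile long active;

static void
enter(void)
{
	(void)WT_ATOMIC_ADD4(active, 1);
}

static void
leave(void)
{
	(void)WT_ATOMIC_SUB4(active, 1);
}

static int
try_claim(void)
{
	return (WT_ATOMIC_CAS4(active, 0, 1));	/* Nonzero on success */
}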
diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h
new file mode 100644
index 00000000000..b71496dd595
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/mutex.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Condition variables:
+ *
+ * WiredTiger uses condition variables to signal between threads, and for
+ * locking operations that are expected to block.
+ */
+struct __wt_condvar {
+ const char *name; /* Mutex name for debugging */
+
+ wt_mutex_t mtx; /* Mutex */
+ wt_cond_t cond; /* Condition variable */
+
+	int waiters;			/* Number of waiters, or
+ -1 if signalled with no waiters. */
+};
+
+/*
+ * Read/write locks:
+ *
+ * WiredTiger uses read/write locks for shared/exclusive access to resources.
+ */
+struct __wt_rwlock {
+ const char *name; /* Lock name for debugging */
+
+ wt_rwlock_t rwlock; /* Read/write lock */
+};
+
+/*
+ * Spin locks:
+ *
+ * WiredTiger uses spinlocks for fast mutual exclusion (where operations done
+ * while holding the spin lock are expected to complete in a small number of
+ * instructions).
+ */
+#define SPINLOCK_GCC 0
+#define SPINLOCK_PTHREAD_MUTEX 1
+#define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 2
+#define SPINLOCK_PTHREAD_MUTEX_LOGGING 3
+#define SPINLOCK_MSVC 4
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
+
+typedef volatile int
+ WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
+ SPINLOCK_TYPE == SPINLOCK_MSVC ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+typedef struct {
+ wt_mutex_t lock;
+
+ uint64_t counter; /* Statistics: counter */
+
+ const char *name; /* Statistics: mutex name */
+ int8_t id; /* Statistics: current holder ID */
+
+ int8_t initialized; /* Lock initialized, for cleanup */
+} WT_SPINLOCK WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+#else
+
+#error Unknown spinlock type
+
+#endif
diff --git a/src/third_party/wiredtiger/src/include/mutex.i b/src/third_party/wiredtiger/src/include/mutex.i
new file mode 100644
index 00000000000..0d5a8586051
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/mutex.i
@@ -0,0 +1,368 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Spin locks:
+ *
+ * These are used for cases where fast mutual exclusion is needed (where
+ * operations done while holding the spin lock are expected to complete in
+ * a small number of instructions).
+ */
+
+#if SPINLOCK_TYPE == SPINLOCK_GCC
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/* Default to spinning 1000 times before yielding. */
+#ifndef WT_SPIN_COUNT
+#define WT_SPIN_COUNT 1000
+#endif
+
+/*
+ * __wt_spin_init --
+ * Initialize a spinlock.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+
+ *(t) = 0;
+ return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ * Destroy a spinlock.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ *(t) = 0;
+}
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ return (__sync_lock_test_and_set(t, 1) == 0 ? 0 : EBUSY);
+}
+
+/*
+ * __wt_spin_lock --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ int i;
+
+ WT_UNUSED(session);
+
+ while (__sync_lock_test_and_set(t, 1)) {
+ for (i = 0; *t && i < WT_SPIN_COUNT; i++)
+ WT_PAUSE();
+ if (*t)
+ __wt_yield();
+ }
+}
+
+/*
+ * __wt_spin_unlock --
+ * Release the spinlock.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ __sync_lock_release(t);
+}
+
+#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_init --
+ * Initialize a spinlock.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE
+ pthread_mutexattr_t attr;
+
+ WT_RET(pthread_mutexattr_init(&attr));
+ WT_RET(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP));
+ WT_RET(pthread_mutex_init(&t->lock, &attr));
+#else
+ WT_RET(pthread_mutex_init(&t->lock, NULL));
+#endif
+
+ t->name = name;
+ t->initialized = 1;
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ WT_RET(__wt_spin_lock_register_lock(session, t));
+#endif
+
+ WT_UNUSED(session);
+ return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ * Destroy a spinlock.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+ __wt_spin_lock_unregister_lock(session, t);
+#endif
+ if (t->initialized) {
+ (void)pthread_mutex_destroy(&t->lock);
+ t->initialized = 0;
+ }
+}
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
+ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ return (pthread_mutex_trylock(&t->lock));
+}
+
+/*
+ * __wt_spin_lock --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ pthread_mutex_lock(&t->lock);
+}
+
+#endif
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * When logging statistics, we track which spinlocks block and why.
+ */
+#define WT_DECL_SPINLOCK_ID(i) \
+ static int i = WT_SPINLOCK_REGISTER
+#define WT_SPINLOCK_REGISTER -1
+#define WT_SPINLOCK_REGISTER_FAILED -2
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock, idp, __FILE__, __LINE__)
+#define __wt_spin_lock(session, lock) do { \
+ WT_DECL_SPINLOCK_ID(__id); \
+ __wt_spin_lock_func(session, lock, &__id, __FILE__, __LINE__); \
+} while (0)
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session,
+ WT_SPINLOCK *t, int *idp, const char *file, int line)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C_SAFE(session);
+ /* If we're not maintaining statistics, it's simple. */
+ if (session == NULL || !FLD_ISSET(conn->stat_flags, WT_STAT_CONN_FAST))
+ return (pthread_mutex_trylock(&t->lock));
+
+ /*
+ * If this caller hasn't yet registered, do so. The caller's location
+ * ID is a static offset into a per-connection structure, and that has
+ * problems: first, if there are multiple connections, we'll need to
+ * hold some kind of lock to avoid racing when setting that value, and
+ * second, if/when there are multiple connections and/or a single
+ * connection is closed and re-opened, the variable may be initialized
+ * and the underlying connection information may not. Check both.
+ */
+ if (*idp == WT_SPINLOCK_REGISTER ||
+ conn->spinlock_block[*idp].name == NULL)
+ WT_RET(__wt_spin_lock_register_caller(
+ session, t->name, file, line, idp));
+
+ /*
+ * Try to acquire the mutex: on failure, update blocking statistics, on
+ * success, set our ID as the mutex holder.
+ *
+ * Note the race between acquiring the lock and setting our ID as the
+ * holder, this can appear in the output as mutexes blocking in ways
+ * that can't actually happen (although still an indicator of a mutex
+ * that's busier than we'd like).
+ */
+ if ((ret = pthread_mutex_trylock(&t->lock)) == 0)
+ t->id = *idp;
+ else
+ if (*idp >= 0) {
+ ++conn->spinlock_block[*idp].total;
+ if (t->id >= 0)
+ ++conn->spinlock_block[*idp].blocked[t->id];
+ }
+
+ /* Update the mutex counter and flush to minimize the windows. */
+ ++t->counter;
+ WT_FULL_BARRIER();
+ return (ret);
+}
+
+/*
+ * __wt_spin_lock_func --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock_func(WT_SESSION_IMPL *session,
+ WT_SPINLOCK *t, int *idp, const char *file, int line)
+{
+ /* If we're not maintaining statistics, it's simple. */
+	if (session == NULL ||
+	    !FLD_ISSET(S2C(session)->stat_flags, WT_STAT_CONN_FAST)) {
+ pthread_mutex_lock(&t->lock);
+ return;
+ }
+
+ /* Try to acquire the mutex. */
+ if (__wt_spin_trylock_func(session, t, idp, file, line) == 0)
+ return;
+
+ /*
+ * On failure, wait on the mutex; once acquired, set our ID as the
+ * holder and flush to minimize the windows.
+ */
+ pthread_mutex_lock(&t->lock);
+ t->id = *idp;
+ WT_FULL_BARRIER();
+}
+
+#endif
+
+/*
+ * __wt_spin_unlock --
+ * Release the spinlock.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ pthread_mutex_unlock(&t->lock);
+}
+
+#elif SPINLOCK_TYPE == SPINLOCK_MSVC
+
+#define WT_DECL_SPINLOCK_ID(i)
+#define WT_SPINLOCK_REGISTER -1
+#define WT_SPINLOCK_REGISTER_FAILED -2
+
+#define __wt_spin_trylock(session, lock, idp) \
+ __wt_spin_trylock_func(session, lock)
+
+/*
+ * __wt_spin_init --
+ * Initialize a spinlock.
+ */
+static inline int
+__wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+
+ InitializeCriticalSectionAndSpinCount(&t->lock, 4000);
+
+ return (0);
+}
+
+/*
+ * __wt_spin_destroy --
+ * Destroy a spinlock.
+ */
+static inline void
+__wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ DeleteCriticalSection(&t->lock);
+}
+
+/*
+ * __wt_spin_trylock_func --
+ * Try to lock a spinlock or fail immediately if it is busy.
+ */
+static inline int
+__wt_spin_trylock_func(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+	BOOL b;
+
+	WT_UNUSED(session);
+
+	b = TryEnterCriticalSection(&t->lock);
+	return (b == 0 ? EBUSY : 0);
+}
+
+/*
+ * __wt_spin_lock --
+ * Spin until the lock is acquired.
+ */
+static inline void
+__wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ EnterCriticalSection(&t->lock);
+}
+
+/*
+ * __wt_spin_unlock --
+ * Release the spinlock.
+ */
+static inline void
+__wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_UNUSED(session);
+
+ LeaveCriticalSection(&t->lock);
+}
+
+#else
+
+#error Unknown spinlock type
+
+#endif
diff --git a/src/third_party/wiredtiger/src/include/os.h b/src/third_party/wiredtiger/src/include/os.h
new file mode 100644
index 00000000000..846249294fe
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/os.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_SYSCALL_RETRY(call, ret) do { \
+ int __retry; \
+ for (__retry = 0; __retry < 10; ++__retry) { \
+ if ((call) == 0) { \
+ (ret) = 0; \
+ break; \
+ } \
+ switch ((ret) = __wt_errno()) { \
+ case 0: \
+ /* The call failed but didn't set errno. */ \
+ (ret) = WT_ERROR; \
+ break; \
+ case EAGAIN: \
+ case EBUSY: \
+ case EINTR: \
+ case EIO: \
+ case EMFILE: \
+ case ENFILE: \
+ case ENOSPC: \
+ __wt_sleep(0L, 500000L); \
+ continue; \
+ default: \
+ break; \
+ } \
+ break; \
+ } \
+} while (0)
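
A usage sketch: the macro retries the listed transient errors (sleeping half a millisecond between attempts via WiredTiger's __wt_sleep) and leaves the final result in `ret`; close(2) here is just an example call:

#include <unistd.h>

/* Sketch: retry a close that may fail with EINTR, EIO, and friends. */
static int
close_with_retry(int fd)
{
	int ret;

	WT_SYSCALL_RETRY(close(fd), ret);
	return (ret);
}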
+
+#define WT_TIMEDIFF(end, begin) \
+ (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
+ (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)
+#define WT_TIMECMP(t1, t2) \
+ ((t1).tv_sec < (t2).tv_sec ? -1 : \
+	    (t1).tv_sec == (t2).tv_sec ?				\
+ (t1).tv_nsec < (t2).tv_nsec ? -1 : \
+ (t1).tv_nsec == (t2).tv_nsec ? 0 : 1 : 1)
+
+struct __wt_fh {
+ char *name; /* File name */
+ TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
+
+ u_int ref; /* Reference count */
+
+#ifndef _WIN32
+ int fd; /* POSIX file handle */
+#else
+ HANDLE filehandle; /* Windows file handle */
+ HANDLE filehandle_secondary; /* Windows file handle
+ for file size changes */
+#endif
+ wt_off_t size; /* File size */
+ wt_off_t extend_size; /* File extended size */
+ wt_off_t extend_len; /* File extend chunk size */
+
+ int direct_io; /* O_DIRECT configured */
+
+ int fallocate_available; /* fallocate/posix_fallocate */
+ int fallocate_requires_locking;
+};
+
+#ifndef _WIN32
+#define WT_SIZET_FMT "zu" /* size_t format string */
+#else
+#define WT_SIZET_FMT "Iu" /* size_t format string */
+#endif
diff --git a/src/third_party/wiredtiger/src/include/os_windows.h b/src/third_party/wiredtiger/src/include/os_windows.h
new file mode 100644
index 00000000000..fcae531184f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/os_windows.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Define WT threading and concurrency primitives
+ * Assumes Windows 7+/2008 R2+
+ */
+typedef CONDITION_VARIABLE wt_cond_t;
+typedef CRITICAL_SECTION wt_mutex_t;
+typedef HANDLE wt_thread_t;
+typedef SRWLOCK wt_rwlock_t;
+
+/* Timespec is a POSIX structure not defined in Windows */
+struct timespec {
+ time_t tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+};
+
+#define strncasecmp _strnicmp
+
+/*
+ * Windows portability: POSIX types that Windows lacks.
+ * Eventually WiredTiger will migrate away from these types.
+ */
+typedef uint32_t u_int;
+typedef unsigned char u_char;
+typedef unsigned long u_long;
+
+/* Versions earlier than VS 2015 (_MSC_VER 1900) lack a conforming snprintf */
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+
+/*
+ * Windows does not define ssize_t; the Python headers may declare it,
+ * so guard the typedef.
+ */
+#ifndef HAVE_SSIZE_T
+typedef int ssize_t;
+#endif
+
+/*
+ * Provide a custom version of vsnprintf that returns the
+ * needed buffer length instead of -1 on truncation
+ */
+#define vsnprintf _wt_vsnprintf
+
+_Check_return_opt_ int __cdecl _wt_vsnprintf(
+ _Out_writes_(_MaxCount) char * _DstBuf,
+ _In_ size_t _MaxCount,
+ _In_z_ _Printf_format_string_ const char * _Format,
+ va_list _ArgList);
+
+/* Provide a custom version of localtime_r */
+struct tm *localtime_r(const time_t* timer, struct tm* result);
diff --git a/src/third_party/wiredtiger/src/include/packing.i b/src/third_party/wiredtiger/src/include/packing.i
new file mode 100644
index 00000000000..6e0e7be13eb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/packing.i
@@ -0,0 +1,685 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * Throughout this code we have to be aware of default argument conversion.
+ *
+ * Refer to Chapter 8 of "Expert C Programming" by Peter van der Linden for the
+ * gory details. The short version is that we have fewer cases to deal with
+ * because the compiler promotes shorter types to int or unsigned int.
+ */
+typedef struct {
+ union {
+ int64_t i;
+ uint64_t u;
+ const char *s;
+ WT_ITEM item;
+ } u;
+ uint32_t size;
+ int8_t havesize;
+ char type;
+} WT_PACK_VALUE;
+
+#define WT_PACK_VALUE_INIT { { 0 }, 0, 0, 0 }
+#define WT_DECL_PACK_VALUE(pv) WT_PACK_VALUE pv = WT_PACK_VALUE_INIT
+
+typedef struct {
+ WT_SESSION_IMPL *session;
+ const char *cur, *end, *orig;
+ unsigned long repeats;
+ WT_PACK_VALUE lastv;
+} WT_PACK;
+
+#define WT_PACK_INIT { NULL, NULL, NULL, NULL, 0, WT_PACK_VALUE_INIT }
+#define WT_DECL_PACK(pack) WT_PACK pack = WT_PACK_INIT
+
+typedef struct {
+ WT_CONFIG config;
+ char buf[20];
+ int count;
+ int iskey;
+ int genname;
+} WT_PACK_NAME;
+
+/*
+ * __pack_initn --
+ * Initialize a pack iterator with the specified string and length.
+ */
+static inline int
+__pack_initn(
+ WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt, size_t len)
+{
+ if (*fmt == '@' || *fmt == '<' || *fmt == '>')
+ return (EINVAL);
+ if (*fmt == '.')
+ ++fmt;
+
+ pack->session = session;
+ pack->cur = pack->orig = fmt;
+ pack->end = fmt + len;
+ pack->repeats = 0;
+ return (0);
+}
+
+/*
+ * __pack_init --
+ * Initialize a pack iterator with the specified string.
+ */
+static inline int
+__pack_init(WT_SESSION_IMPL *session, WT_PACK *pack, const char *fmt)
+{
+ return (__pack_initn(session, pack, fmt, strlen(fmt)));
+}
+
+/*
+ * __pack_name_init --
+ * Initialize the name of a pack iterator.
+ */
+static inline int
+__pack_name_init(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *names,
+ int iskey, WT_PACK_NAME *pn)
+{
+ WT_CLEAR(*pn);
+ pn->iskey = iskey;
+
+ if (names->str != NULL)
+ WT_RET(__wt_config_subinit(session, &pn->config, names));
+ else
+ pn->genname = 1;
+
+ return (0);
+}
+
+/*
+ * __pack_name_next --
+ * Get the next field type from a pack iterator.
+ */
+static inline int
+__pack_name_next(WT_PACK_NAME *pn, WT_CONFIG_ITEM *name)
+{
+ WT_CONFIG_ITEM ignore;
+
+ if (pn->genname) {
+ (void)snprintf(pn->buf, sizeof(pn->buf),
+ (pn->iskey ? "key%d" : "value%d"), pn->count);
+ WT_CLEAR(*name);
+ name->str = pn->buf;
+ name->len = strlen(pn->buf);
+ name->type = WT_CONFIG_ITEM_STRING;
+ pn->count++;
+ }
+ else
+ WT_RET(__wt_config_next(&pn->config, name, &ignore));
+
+ return (0);
+}
+
+/*
+ * __pack_next --
+ *	Get the next value from a pack iterator.
+ */
+static inline int
+__pack_next(WT_PACK *pack, WT_PACK_VALUE *pv)
+{
+ char *endsize;
+
+ if (pack->repeats > 0) {
+ *pv = pack->lastv;
+ --pack->repeats;
+ return (0);
+ }
+
+next: if (pack->cur == pack->end)
+ return (WT_NOTFOUND);
+
+ if (isdigit(*pack->cur)) {
+ pv->havesize = 1;
+ pv->size = WT_STORE_SIZE(strtoul(pack->cur, &endsize, 10));
+ pack->cur = endsize;
+ } else {
+ pv->havesize = 0;
+ pv->size = 1;
+ }
+
+ pv->type = *pack->cur++;
+ pack->repeats = 0;
+
+ switch (pv->type) {
+ case 'S':
+ case 's':
+ case 'x':
+ return (0);
+ case 't':
+ if (pv->size < 1 || pv->size > 8)
+ WT_RET_MSG(pack->session, EINVAL,
+ "Bitfield sizes must be between 1 and 8 bits "
+ "in format '%.*s'",
+ (int)(pack->end - pack->orig), pack->orig);
+ return (0);
+ case 'u':
+ case 'U':
+ /* Special case for items with a size prefix. */
+ pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u';
+ return (0);
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'l':
+ case 'L':
+ case 'q':
+ case 'Q':
+ case 'r':
+ case 'R':
+ /* Integral types repeat <size> times. */
+ if (pv->size == 0)
+ goto next;
+ pack->repeats = pv->size - 1;
+ pack->lastv = *pv;
+ return (0);
+ default:
+ WT_RET_MSG(pack->session, EINVAL,
+ "Invalid type '%c' found in format '%.*s'",
+ pv->type, (int)(pack->end - pack->orig), pack->orig);
+	}
+}
+
+#define WT_PACK_GET(session, pv, ap) do { \
+ WT_ITEM *__item; \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ pv.u.s = va_arg(ap, const char *); \
+ break; \
+ case 'U': \
+ case 'u': \
+ __item = va_arg(ap, WT_ITEM *); \
+ pv.u.item.data = __item->data; \
+ pv.u.item.size = __item->size; \
+ break; \
+ case 'b': \
+ case 'h': \
+ case 'i': \
+ pv.u.i = va_arg(ap, int); \
+ break; \
+ case 'B': \
+ case 'H': \
+ case 'I': \
+ case 't': \
+ pv.u.u = va_arg(ap, unsigned int); \
+ break; \
+ case 'l': \
+ pv.u.i = va_arg(ap, long); \
+ break; \
+ case 'L': \
+ pv.u.u = va_arg(ap, unsigned long); \
+ break; \
+ case 'q': \
+ pv.u.i = va_arg(ap, int64_t); \
+ break; \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ pv.u.u = va_arg(ap, uint64_t); \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __pack_size --
+ * Get the size of a packed value.
+ */
+static inline size_t
+__pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv)
+{
+ size_t s, pad;
+
+ switch (pv->type) {
+ case 'x':
+ return (pv->size);
+ case 'j':
+ case 'J':
+ if (pv->type == 'j' || pv->havesize)
+ s = pv->size;
+ else {
+ ssize_t len;
+
+ /* The string was previously validated. */
+ len = __wt_json_strlen(pv->u.item.data,
+ pv->u.item.size);
+ WT_ASSERT(session, len >= 0);
+ s = (size_t)len + 1;
+ }
+ return (s);
+ case 's':
+ case 'S':
+ if (pv->type == 's' || pv->havesize)
+ s = pv->size;
+ else
+ s = strlen(pv->u.s) + 1;
+ return (s);
+ case 'U':
+ case 'u':
+ s = pv->u.item.size;
+ pad = 0;
+ if (pv->havesize && pv->size < s)
+ s = pv->size;
+ else if (pv->havesize)
+ pad = pv->size - s;
+ if (pv->type == 'U')
+ s += __wt_vsize_uint(s + pad);
+ return (s + pad);
+ case 'b':
+ case 'B':
+ case 't':
+ return (1);
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ return (__wt_vsize_int(pv->u.i));
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ return (__wt_vsize_uint(pv->u.u));
+ case 'R':
+ return (sizeof(uint64_t));
+ }
+
+ __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type);
+ return ((size_t)-1);
+}
+
+/*
+ * __pack_write --
+ * Pack a value into a buffer.
+ */
+static inline int
+__pack_write(
+ WT_SESSION_IMPL *session, WT_PACK_VALUE *pv, uint8_t **pp, size_t maxlen)
+{
+ uint8_t *oldp;
+ size_t s, pad;
+
+ switch (pv->type) {
+ case 'x':
+ WT_SIZE_CHECK(pv->size, maxlen);
+ memset(*pp, 0, pv->size);
+ *pp += pv->size;
+ break;
+ case 's':
+ case 'S':
+ /*
+ * XXX if pv->havesize, only want to know if there is a
+ * '\0' in the first pv->size characters.
+ */
+ s = strlen(pv->u.s);
+ if ((pv->type == 's' || pv->havesize) && pv->size < s) {
+ s = pv->size;
+ pad = 0;
+ } else if (pv->havesize)
+ pad = pv->size - s;
+ else
+ pad = 1;
+ WT_SIZE_CHECK(s + pad, maxlen);
+ if (s > 0)
+ memcpy(*pp, pv->u.s, s);
+ *pp += s;
+ if (pad > 0) {
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'j':
+ case 'J':
+ s = pv->u.item.size;
+ if ((pv->type == 'j' || pv->havesize) && pv->size < s) {
+ s = pv->size;
+ pad = 0;
+ } else if (pv->havesize)
+ pad = pv->size - s;
+ else
+ pad = 1;
+ if (s > 0) {
+ oldp = *pp;
+ WT_RET(__wt_json_strncpy((char **)pp, maxlen,
+ pv->u.item.data, s));
+ maxlen -= (size_t)(*pp - oldp);
+ }
+ if (pad > 0) {
+ WT_SIZE_CHECK(pad, maxlen);
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'U':
+ case 'u':
+ s = pv->u.item.size;
+ pad = 0;
+ if (pv->havesize && pv->size < s)
+ s = pv->size;
+ else if (pv->havesize)
+ pad = pv->size - s;
+ if (pv->type == 'U') {
+ oldp = *pp;
+ WT_RET(__wt_vpack_uint(pp, maxlen, s + pad));
+ maxlen -= (size_t)(*pp - oldp);
+ }
+ WT_SIZE_CHECK(s + pad, maxlen);
+ if (s > 0)
+ memcpy(*pp, pv->u.item.data, s);
+ *pp += s;
+ if (pad > 0) {
+ memset(*pp, 0, pad);
+ *pp += pad;
+ }
+ break;
+ case 'b':
+ /* Translate to maintain ordering with the sign bit. */
+ WT_SIZE_CHECK(1, maxlen);
+ **pp = (uint8_t)(pv->u.i + 0x80);
+ *pp += 1;
+ break;
+ case 'B':
+ case 't':
+ WT_SIZE_CHECK(1, maxlen);
+ **pp = (uint8_t)pv->u.u;
+ *pp += 1;
+ break;
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__wt_vpack_int(pp, maxlen, pv->u.i));
+ break;
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ WT_RET(__wt_vpack_uint(pp, maxlen, pv->u.u));
+ break;
+ case 'R':
+ WT_SIZE_CHECK(sizeof(uint64_t), maxlen);
+ *(uint64_t *)*pp = pv->u.u;
+ *pp += sizeof(uint64_t);
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unknown pack-value type: %c", (int)pv->type);
+ }
+
+ return (0);
+}
+
+/*
+ * __unpack_read --
+ * Read a packed value from a buffer.
+ */
+static inline int
+__unpack_read(WT_SESSION_IMPL *session,
+ WT_PACK_VALUE *pv, const uint8_t **pp, size_t maxlen)
+{
+ size_t s;
+
+ switch (pv->type) {
+ case 'x':
+ WT_SIZE_CHECK(pv->size, maxlen);
+ *pp += pv->size;
+ break;
+ case 's':
+ case 'S':
+ if (pv->type == 's' || pv->havesize)
+ s = pv->size;
+ else
+ s = strlen((const char *)*pp) + 1;
+ if (s > 0)
+ pv->u.s = (const char *)*pp;
+ WT_SIZE_CHECK(s, maxlen);
+ *pp += s;
+ break;
+ case 'U':
+ WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u));
+ /* FALLTHROUGH */
+ case 'u':
+ if (pv->havesize)
+ s = pv->size;
+ else if (pv->type == 'U')
+ s = (size_t)pv->u.u;
+ else
+ s = maxlen;
+ WT_SIZE_CHECK(s, maxlen);
+ pv->u.item.data = *pp;
+ pv->u.item.size = s;
+ *pp += s;
+ break;
+ case 'b':
+ /* Translate to maintain ordering with the sign bit. */
+ WT_SIZE_CHECK(1, maxlen);
+ pv->u.i = (int8_t)(*(*pp)++ - 0x80);
+ break;
+ case 'B':
+ case 't':
+ WT_SIZE_CHECK(1, maxlen);
+ pv->u.u = *(*pp)++;
+ break;
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__wt_vunpack_int(pp, maxlen, &pv->u.i));
+ break;
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'r':
+ WT_RET(__wt_vunpack_uint(pp, maxlen, &pv->u.u));
+ break;
+ case 'R':
+ WT_SIZE_CHECK(sizeof(uint64_t), maxlen);
+ pv->u.u = *(uint64_t *)*pp;
+ *pp += sizeof(uint64_t);
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unknown pack-value type: %c", (int)pv->type);
+ }
+
+ return (0);
+}
+
+#define WT_UNPACK_PUT(session, pv, ap) do { \
+ WT_ITEM *__item; \
+ switch (pv.type) { \
+ case 'x': \
+ break; \
+ case 's': \
+ case 'S': \
+ *va_arg(ap, const char **) = pv.u.s; \
+ break; \
+ case 'U': \
+ case 'u': \
+ __item = va_arg(ap, WT_ITEM *); \
+ __item->data = pv.u.item.data; \
+ __item->size = pv.u.item.size; \
+ break; \
+ case 'b': \
+ *va_arg(ap, int8_t *) = (int8_t)pv.u.i; \
+ break; \
+ case 'h': \
+ *va_arg(ap, int16_t *) = (short)pv.u.i; \
+ break; \
+ case 'i': \
+ case 'l': \
+ *va_arg(ap, int32_t *) = (int32_t)pv.u.i; \
+ break; \
+ case 'q': \
+ *va_arg(ap, int64_t *) = pv.u.i; \
+ break; \
+ case 'B': \
+ case 't': \
+ *va_arg(ap, uint8_t *) = (uint8_t)pv.u.u; \
+ break; \
+ case 'H': \
+ *va_arg(ap, uint16_t *) = (uint16_t)pv.u.u; \
+ break; \
+ case 'I': \
+ case 'L': \
+ *va_arg(ap, uint32_t *) = (uint32_t)pv.u.u; \
+ break; \
+ case 'Q': \
+ case 'r': \
+ case 'R': \
+ *va_arg(ap, uint64_t *) = pv.u.u; \
+ break; \
+ /* User format strings have already been validated. */ \
+ WT_ILLEGAL_VALUE(session); \
+ } \
+} while (0)
+
+/*
+ * __wt_struct_packv --
+ * Pack a byte string (va_list version).
+ */
+static inline int
+__wt_struct_packv(WT_SESSION_IMPL *session,
+ void *buffer, size_t size, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ WT_PACK_GET(session, pv, ap);
+ return (__pack_write(session, &pv, &p, size));
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ WT_PACK_GET(session, pv, ap);
+ WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p)));
+ }
+
+ /* Be paranoid - __pack_write should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_sizev --
+ * Calculate the size of a packed byte string (va_list version).
+ */
+static inline int
+__wt_struct_sizev(
+ WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_PACK pack;
+ size_t total;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ WT_PACK_GET(session, pv, ap);
+ *sizep = __pack_size(session, &pv);
+ return (0);
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ for (total = 0; __pack_next(&pack, &pv) == 0;) {
+ WT_PACK_GET(session, pv, ap);
+ total += __pack_size(session, &pv);
+ }
+ *sizep = total;
+ return (0);
+}
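
Because a va_list is consumed by __wt_struct_sizev, a caller that sizes and then packs must replay the arguments with va_copy; a hedged sketch of that pattern (the wrapper name and the ENOMEM choice are illustrative):

#include <errno.h>
#include <stdarg.h>

/* Sketch: size a record, then pack it into a fixed buffer. */
static int
size_then_pack(WT_SESSION_IMPL *session,
    void *buf, size_t buflen, size_t *usedp, const char *fmt, ...)
{
	va_list ap, ap2;
	int ret;

	va_start(ap, fmt);
	va_copy(ap2, ap);
	if ((ret = __wt_struct_sizev(session, usedp, fmt, ap)) == 0)
		ret = *usedp > buflen ? ENOMEM :
		    __wt_struct_packv(session, buf, *usedp, fmt, ap2);
	va_end(ap2);
	va_end(ap);
	return (ret);
}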
+
+/*
+ * __wt_struct_unpackv --
+ * Unpack a byte string (va_list version).
+ */
+static inline int
+__wt_struct_unpackv(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, va_list ap)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ const uint8_t *p, *end;
+
+ p = buffer;
+ end = p + size;
+
+ if (fmt[0] != '\0' && fmt[1] == '\0') {
+ pv.type = fmt[0];
+ if ((ret = __unpack_read(session, &pv, &p, size)) == 0)
+ WT_UNPACK_PUT(session, pv, ap);
+		return (ret);
+ }
+
+ WT_RET(__pack_init(session, &pack, fmt));
+ while ((ret = __pack_next(&pack, &pv)) == 0) {
+ WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p)));
+ WT_UNPACK_PUT(session, pv, ap);
+ }
+
+	/* Be paranoid - __unpack_read should never overflow. */
+ WT_ASSERT(session, p <= end);
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_size_adjust --
+ * Adjust the size field for a packed structure.
+ *
+ * Sometimes we want to include the size as a field in a packed structure.
+ * This is done by calling __wt_struct_size with the expected format and
+ * a size of zero. Then we want to pack the structure using the final
+ * size. This function adjusts the size appropriately (taking into
+ * account the size of the final size or the size field itself).
+ */
+static inline void
+__wt_struct_size_adjust(WT_SESSION_IMPL *session, size_t *sizep)
+{
+ size_t prev_size = 1;
+ size_t orig_size = *sizep;
+ size_t field_size0 = __wt_vsize_uint(orig_size);
+ size_t field_size1 =
+ __wt_vsize_uint(orig_size + field_size0 - prev_size);
+ *sizep += field_size1 - prev_size;
+
+ /*
+ * Make sure the field size we calculated matches the adjusted size.
+ * This relies on the fact that we are only adjusting by a small number
+ * of bytes, so we won't cross multiple boundaries in the packing
+ * routine. If that were not true, we would need to iterate here until
+ * the field size stops growing.
+ */
+ WT_ASSERT(session, field_size1 == __wt_vsize_uint(*sizep));
+}
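
A worked example, assuming WiredTiger's variable-length integers store values up to 63 in a single byte: for an original size of 100, field_size0 = __wt_vsize_uint(100) = 2, field_size1 = __wt_vsize_uint(100 + 2 - 1) = 2, and the adjusted size is 100 + 2 - 1 = 101, which still packs in two bytes, so the closing assertion holds.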
diff --git a/src/third_party/wiredtiger/src/include/posix.h b/src/third_party/wiredtiger/src/include/posix.h
new file mode 100644
index 00000000000..e3b43ea38ab
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/posix.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Some systems don't configure 64-bit MIN/MAX by default. */
+#ifndef ULLONG_MAX
+#define ULLONG_MAX 0xffffffffffffffffULL
+#endif
+#ifndef LLONG_MAX
+#define LLONG_MAX 0x7fffffffffffffffLL
+#endif
+#ifndef LLONG_MIN
+#define LLONG_MIN (-0x7fffffffffffffffLL - 1)
+#endif
+
+/* Define O_BINARY for Posix systems */
+#define O_BINARY 0
+
+/*
+ * Define WT threading and concurrency primitives
+ */
+typedef pthread_cond_t wt_cond_t;
+typedef pthread_mutex_t wt_mutex_t;
+typedef pthread_t wt_thread_t;
+
+/*
+ * !!!
+ * Don't touch this structure without understanding the read/write
+ * locking functions.
+ */
+typedef union { /* Read/write lock */
+#ifdef WORDS_BIGENDIAN
+ WiredTiger read/write locks require modification for big-endian systems.
+#else
+ uint64_t u;
+ uint32_t us;
+ struct {
+ uint16_t writers;
+ uint16_t readers;
+ uint16_t users;
+ uint16_t pad;
+ } s;
+#endif
+} wt_rwlock_t;
diff --git a/src/third_party/wiredtiger/src/include/queue.h b/src/third_party/wiredtiger/src/include/queue.h
new file mode 100644
index 00000000000..42e736e7b09
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/queue.h
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: src/sys/sys/queue.h,v 1.54 2002/08/05 05:18:43 alfred Exp $
+ */
+
+#ifndef _DB_QUEUE_H_
+#define _DB_QUEUE_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may only be traversed in the forward direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ *
+ * SLIST LIST STAILQ TAILQ
+ * _HEAD + + + +
+ * _HEAD_INITIALIZER + + + +
+ * _ENTRY + + + +
+ * _INIT + + + +
+ * _EMPTY + + + +
+ * _FIRST + + + +
+ * _NEXT + + + +
+ * _PREV - - - +
+ * _LAST - - + +
+ * _FOREACH + + + +
+ * _FOREACH_REVERSE - - - +
+ * _INSERT_HEAD + + + +
+ * _INSERT_BEFORE - + - +
+ * _INSERT_AFTER + + + +
+ * _INSERT_TAIL - - + +
+ * _CONCAT - - + +
+ * _REMOVE_HEAD + - + -
+ * _REMOVE + + + +
+ *
+ */
+
+/*
+ * XXX
+ * We #undef all of the macros because there are incompatible versions of this
+ * file and these macros on various systems. What makes the problem worse is
+ * they are included and/or defined by system include files which we may have
+ * already loaded into Berkeley DB before getting here. For example, FreeBSD's
+ * <rpc/rpc.h> includes its system <sys/queue.h>, and VxWorks UnixLib.h defines
+ * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
+ * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
+ */
+#undef LIST_EMPTY
+#undef LIST_ENTRY
+#undef LIST_FIRST
+#undef LIST_FOREACH
+#undef LIST_HEAD
+#undef LIST_HEAD_INITIALIZER
+#undef LIST_INIT
+#undef LIST_INSERT_AFTER
+#undef LIST_INSERT_BEFORE
+#undef LIST_INSERT_HEAD
+#undef LIST_NEXT
+#undef LIST_REMOVE
+#undef QMD_TRACE_ELEM
+#undef QMD_TRACE_HEAD
+#undef QUEUE_MACRO_DEBUG
+#undef SLIST_EMPTY
+#undef SLIST_ENTRY
+#undef SLIST_FIRST
+#undef SLIST_FOREACH
+#undef SLIST_FOREACH_PREVPTR
+#undef SLIST_HEAD
+#undef SLIST_HEAD_INITIALIZER
+#undef SLIST_INIT
+#undef SLIST_INSERT_AFTER
+#undef SLIST_INSERT_HEAD
+#undef SLIST_NEXT
+#undef SLIST_REMOVE
+#undef SLIST_REMOVE_HEAD
+#undef STAILQ_CONCAT
+#undef STAILQ_EMPTY
+#undef STAILQ_ENTRY
+#undef STAILQ_FIRST
+#undef STAILQ_FOREACH
+#undef STAILQ_HEAD
+#undef STAILQ_HEAD_INITIALIZER
+#undef STAILQ_INIT
+#undef STAILQ_INSERT_AFTER
+#undef STAILQ_INSERT_HEAD
+#undef STAILQ_INSERT_TAIL
+#undef STAILQ_LAST
+#undef STAILQ_NEXT
+#undef STAILQ_REMOVE
+#undef STAILQ_REMOVE_HEAD
+#undef STAILQ_REMOVE_HEAD_UNTIL
+#undef TAILQ_CONCAT
+#undef TAILQ_EMPTY
+#undef TAILQ_ENTRY
+#undef TAILQ_FIRST
+#undef TAILQ_FOREACH
+#undef TAILQ_FOREACH_REVERSE
+#undef TAILQ_HEAD
+#undef TAILQ_HEAD_INITIALIZER
+#undef TAILQ_INIT
+#undef TAILQ_INSERT_AFTER
+#undef TAILQ_INSERT_BEFORE
+#undef TAILQ_INSERT_HEAD
+#undef TAILQ_INSERT_TAIL
+#undef TAILQ_LAST
+#undef TAILQ_NEXT
+#undef TAILQ_PREV
+#undef TAILQ_REMOVE
+#undef TRACEBUF
+#undef TRASHIT
+
+#define QUEUE_MACRO_DEBUG 0
+#if QUEUE_MACRO_DEBUG
+/* Store the last 2 places the queue element or head was altered */
+struct qm_trace {
+ char * lastfile;
+ int lastline;
+ char * prevfile;
+ int prevline;
+};
+
+#define TRACEBUF struct qm_trace trace;
+#define TRASHIT(x) do {(x) = (void *)-1;} while (0)
+
+#define QMD_TRACE_HEAD(head) do { \
+ (head)->trace.prevline = (head)->trace.lastline; \
+ (head)->trace.prevfile = (head)->trace.lastfile; \
+ (head)->trace.lastline = __LINE__; \
+ (head)->trace.lastfile = __FILE__; \
+} while (0)
+
+#define QMD_TRACE_ELEM(elem) do { \
+ (elem)->trace.prevline = (elem)->trace.lastline; \
+ (elem)->trace.prevfile = (elem)->trace.lastfile; \
+ (elem)->trace.lastline = __LINE__; \
+ (elem)->trace.lastfile = __FILE__; \
+} while (0)
+
+#else
+#define QMD_TRACE_ELEM(elem)
+#define QMD_TRACE_HEAD(head)
+#define TRACEBUF
+#define TRASHIT(x)
+#endif /* QUEUE_MACRO_DEBUG */
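+
+/*
+ * When QUEUE_MACRO_DEBUG is set to 1 above, each TAILQ head and entry
+ * embeds a struct qm_trace, and the tail queue macros record the
+ * __FILE__/__LINE__ of the last two modifications.  If a queue is found
+ * corrupted, a debugger can then report where it was most recently
+ * altered, for example:
+ *
+ *     (gdb) print head.trace.lastfile
+ *     (gdb) print head.trace.lastline
+ */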
+
+/*
+ * Singly-linked List declarations.
+ */
+#define SLIST_HEAD(name, type) \
+struct name { \
+ struct type *slh_first; /* first element */ \
+}
+
+#define SLIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define SLIST_ENTRY(type) \
+struct { \
+ struct type *sle_next; /* next element */ \
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
+
+#define SLIST_FIRST(head) ((head)->slh_first)
+
+#define SLIST_FOREACH(var, head, field) \
+ for ((var) = SLIST_FIRST((head)); \
+ (var); \
+ (var) = SLIST_NEXT((var), field))
+
+#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
+ for ((varp) = &SLIST_FIRST((head)); \
+ ((var) = *(varp)) != NULL; \
+ (varp) = &SLIST_NEXT((var), field))
+
+#define SLIST_INIT(head) do { \
+ SLIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
+ SLIST_NEXT((slistelm), field) = (elm); \
+} while (0)
+
+#define SLIST_INSERT_HEAD(head, elm, field) do { \
+ SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
+ SLIST_FIRST((head)) = (elm); \
+} while (0)
+
+#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
+
+#define SLIST_REMOVE(head, elm, type, field) do { \
+ if (SLIST_FIRST((head)) == (elm)) { \
+ SLIST_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = SLIST_FIRST((head)); \
+ while (SLIST_NEXT(curelm, field) != (elm)) \
+ curelm = SLIST_NEXT(curelm, field); \
+ SLIST_NEXT(curelm, field) = \
+ SLIST_NEXT(SLIST_NEXT(curelm, field), field); \
+ } \
+} while (0)
+
+#define SLIST_REMOVE_HEAD(head, field) do { \
+ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
+} while (0)
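+
+/*
+ * SLIST_FOREACH_PREVPTR tracks the address of the pointer referencing the
+ * current element, allowing O(1) unlinking at the cursor instead of the
+ * O(n) rescan SLIST_REMOVE performs.  A minimal sketch (hypothetical
+ * names):
+ *
+ *     struct entry *e, **prevp;
+ *     SLIST_FOREACH_PREVPTR(e, prevp, &head, link)
+ *         if (e->value == target) {
+ *             *prevp = SLIST_NEXT(e, link);
+ *             break;
+ *         }
+ */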
+
+/*
+ * Singly-linked Tail queue declarations.
+ */
+#define STAILQ_HEAD(name, type) \
+struct name { \
+ struct type *stqh_first;/* first element */ \
+ struct type **stqh_last;/* addr of last next element */ \
+}
+
+#define STAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).stqh_first }
+
+#define STAILQ_ENTRY(type) \
+struct { \
+ struct type *stqe_next; /* next element */ \
+}
+
+/*
+ * Singly-linked Tail queue functions.
+ */
+#define STAILQ_CONCAT(head1, head2) do { \
+ if (!STAILQ_EMPTY((head2))) { \
+ *(head1)->stqh_last = (head2)->stqh_first; \
+ (head1)->stqh_last = (head2)->stqh_last; \
+ STAILQ_INIT((head2)); \
+ } \
+} while (0)
+
+#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
+
+#define STAILQ_FIRST(head) ((head)->stqh_first)
+
+#define STAILQ_FOREACH(var, head, field) \
+ for ((var) = STAILQ_FIRST((head)); \
+ (var); \
+ (var) = STAILQ_NEXT((var), field))
+
+#define STAILQ_INIT(head) do { \
+ STAILQ_FIRST((head)) = NULL; \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_NEXT((tqelm), field) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+ STAILQ_FIRST((head)) = (elm); \
+} while (0)
+
+#define STAILQ_INSERT_TAIL(head, elm, field) do { \
+ STAILQ_NEXT((elm), field) = NULL; \
+ *(head)->stqh_last = (elm); \
+ (head)->stqh_last = &STAILQ_NEXT((elm), field); \
+} while (0)
+
+#define STAILQ_LAST(head, type, field) \
+ (STAILQ_EMPTY((head)) ? \
+ NULL : \
+ ((struct type *) \
+ ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
+
+#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
+
+#define STAILQ_REMOVE(head, elm, type, field) do { \
+ if (STAILQ_FIRST((head)) == (elm)) { \
+ STAILQ_REMOVE_HEAD((head), field); \
+ } \
+ else { \
+ struct type *curelm = STAILQ_FIRST((head)); \
+ while (STAILQ_NEXT(curelm, field) != (elm)) \
+ curelm = STAILQ_NEXT(curelm, field); \
+ if ((STAILQ_NEXT(curelm, field) = \
+ STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
+ (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
+ } \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD(head, field) do { \
+ if ((STAILQ_FIRST((head)) = \
+ STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
+
+#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
+ if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
+ (head)->stqh_last = &STAILQ_FIRST((head)); \
+} while (0)
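+
+/*
+ * As noted above, singly-linked tail queues suit FIFO queues.  A minimal
+ * producer/consumer sketch (hypothetical names):
+ *
+ *     STAILQ_HEAD(jobq, job) q = STAILQ_HEAD_INITIALIZER(q);
+ *
+ * The producer appends:
+ *     STAILQ_INSERT_TAIL(&q, j, link);
+ * The consumer pops from the head:
+ *     while (!STAILQ_EMPTY(&q)) {
+ *         j = STAILQ_FIRST(&q);
+ *         STAILQ_REMOVE_HEAD(&q, link);
+ *         process(j);
+ *     }
+ */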
+
+/*
+ * List declarations.
+ */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+#define LIST_HEAD_INITIALIZER(head) \
+ { NULL }
+
+#define LIST_ENTRY(type) \
+struct { \
+ struct type *le_next; /* next element */ \
+ struct type **le_prev; /* address of previous next element */ \
+}
+
+/*
+ * List functions.
+ */
+
+#define LIST_EMPTY(head) ((head)->lh_first == NULL)
+
+#define LIST_FIRST(head) ((head)->lh_first)
+
+#define LIST_FOREACH(var, head, field) \
+ for ((var) = LIST_FIRST((head)); \
+ (var); \
+ (var) = LIST_NEXT((var), field))
+
+#define LIST_INIT(head) do { \
+ LIST_FIRST((head)) = NULL; \
+} while (0)
+
+#define LIST_INSERT_AFTER(listelm, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
+ LIST_NEXT((listelm), field)->field.le_prev = \
+ &LIST_NEXT((elm), field); \
+ LIST_NEXT((listelm), field) = (elm); \
+ (elm)->field.le_prev = &LIST_NEXT((listelm), field); \
+} while (0)
+
+#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.le_prev = (listelm)->field.le_prev; \
+ LIST_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.le_prev = (elm); \
+ (listelm)->field.le_prev = &LIST_NEXT((elm), field); \
+} while (0)
+
+#define LIST_INSERT_HEAD(head, elm, field) do { \
+ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
+ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
+ LIST_FIRST((head)) = (elm); \
+ (elm)->field.le_prev = &LIST_FIRST((head)); \
+} while (0)
+
+#define LIST_NEXT(elm, field) ((elm)->field.le_next)
+
+#define LIST_REMOVE(elm, field) do { \
+ if (LIST_NEXT((elm), field) != NULL) \
+ LIST_NEXT((elm), field)->field.le_prev = \
+ (elm)->field.le_prev; \
+ *(elm)->field.le_prev = LIST_NEXT((elm), field); \
+} while (0)
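+
+/*
+ * Note that le_prev does not point at the previous element; it points at
+ * the previous element's le_next pointer (or at lh_first for the first
+ * element).  That is why LIST_REMOVE above unlinks in O(1) with
+ *
+ *     *(elm)->field.le_prev = LIST_NEXT((elm), field);
+ *
+ * without walking the list, and why a list cannot be traversed backward:
+ * no pointer to the previous element itself is kept.
+ */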
+
+/*
+ * Tail queue declarations.
+ */
+#define TAILQ_HEAD(name, type) \
+struct name { \
+ struct type *tqh_first; /* first element */ \
+ struct type **tqh_last; /* addr of last next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).tqh_first }
+
+#define TAILQ_ENTRY(type) \
+struct { \
+ struct type *tqe_next; /* next element */ \
+ struct type **tqe_prev; /* address of previous next element */ \
+ TRACEBUF \
+}
+
+/*
+ * Tail queue functions.
+ */
+#define TAILQ_CONCAT(head1, head2, field) do { \
+ if (!TAILQ_EMPTY(head2)) { \
+ *(head1)->tqh_last = (head2)->tqh_first; \
+ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ TAILQ_INIT((head2)); \
+		QMD_TRACE_HEAD(head1);					\
+ QMD_TRACE_HEAD(head2); \
+ } \
+} while (0)
+
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_INIT(head) do { \
+ TAILQ_FIRST((head)) = NULL; \
+ (head)->tqh_last = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+} while (0)
+
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else { \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ } \
+ TAILQ_NEXT((listelm), field) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+	QMD_TRACE_ELEM(&(listelm)->field);				\
+} while (0)
+
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ TAILQ_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+	QMD_TRACE_ELEM(&(listelm)->field);				\
+} while (0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
+ TAILQ_FIRST((head))->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ TAILQ_FIRST((head)) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ TAILQ_NEXT((elm), field) = NULL; \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
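+
+/*
+ * TAILQ_LAST and TAILQ_PREV work by type punning: tqh_last and tqe_prev
+ * always hold the address of some "next" pointer, and a TAILQ_ENTRY has
+ * the same two-pointer layout as a TAILQ_HEAD.  Casting such an address
+ * to struct headname * makes ->tqh_last read the referenced element's
+ * tqe_prev field, and dereferencing that pointer yields the element one
+ * step back.  This is also why these two macros need the head's struct
+ * name as an argument while the other macros do not.
+ */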
+
+#define TAILQ_REMOVE(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field)) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else { \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ QMD_TRACE_HEAD(head); \
+ } \
+ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
+ TRASHIT((elm)->field.tqe_next); \
+ TRASHIT((elm)->field.tqe_prev); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* !_DB_QUEUE_H_ */
diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h
new file mode 100644
index 00000000000..e24a19b03ca
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/schema.h
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/* Character constants for projection plans */
+#define WT_PROJ_KEY 'k' /* Go to key in cursor <arg> */
+#define WT_PROJ_NEXT 'n' /* Process the next item (<arg> repeats) */
+#define WT_PROJ_REUSE 'r' /* Reuse the previous item (<arg> repeats) */
+#define WT_PROJ_SKIP 's' /* Skip a column in the cursor (<arg> repeats) */
+#define WT_PROJ_VALUE 'v' /* Go to the value in cursor <arg> */
+
+struct __wt_colgroup {
+ const char *name; /* Logical name */
+ const char *source; /* Underlying data source */
+ const char *config; /* Configuration string */
+
+ WT_CONFIG_ITEM colconf; /* List of columns from config */
+};
+
+struct __wt_index {
+ const char *name; /* Logical name */
+ const char *source; /* Underlying data source */
+ const char *config; /* Configuration string */
+
+ WT_CONFIG_ITEM colconf; /* List of columns from config */
+
+ const char *idxkey_format; /* Index key format (hides primary) */
+ const char *key_format; /* Key format */
+ const char *key_plan; /* Key projection plan */
+ const char *value_plan; /* Value projection plan */
+};
+
+/*
+ * WT_TABLE --
+ * Handle for a logical table. A table consists of one or more column
+ * groups, each of which holds some set of columns all sharing a primary
+ * key; and zero or more indices, each of which holds some set of columns
+ * in an index key that can be used to reconstruct the primary key.
+ */
+struct __wt_table {
+ const char *name, *config, *plan;
+ const char *key_format, *value_format;
+
+ WT_CONFIG_ITEM cgconf, colconf;
+
+ WT_COLGROUP **cgroups;
+ WT_INDEX **indices;
+ size_t idx_alloc;
+
+ TAILQ_ENTRY(__wt_table) q;
+
+ int cg_complete, idx_complete, is_simple;
+ u_int ncolgroups, nindices, nkey_columns;
+
+ uint32_t refcnt; /* Number of open cursors */
+ uint32_t schema_gen; /* Cached schema generation number */
+};
+
+/*
+ * Tables without explicit column groups have a single default column group
+ * containing all of the columns.
+ */
+#define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1)
+
+/*
+ * WT_WITH_SCHEMA_LOCK --
+ * Acquire the schema lock, perform an operation, drop the lock.
+ */
+#define WT_WITH_SCHEMA_LOCK(session, op) do { \
+ WT_ASSERT(session, \
+ F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) || \
+ !F_ISSET(session, WT_SESSION_NO_SCHEMA_LOCK)); \
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \
+ (op); \
+ } else { \
+ __wt_spin_lock(session, &S2C(session)->schema_lock); \
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED); \
+ (op); \
+ __wt_spin_unlock(session, &S2C(session)->schema_lock); \
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
+ } \
+} while (0)
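+
+/*
+ * Typical usage is to wrap a schema operation, for example (the worker
+ * function name here is hypothetical):
+ *
+ *     WT_WITH_SCHEMA_LOCK(session,
+ *         ret = __schema_worker(session, uri, cfg));
+ *
+ * The macro runs the operation under an already-held schema lock when the
+ * session has one, and otherwise acquires and releases the lock around it,
+ * so schema operations need not know whether their caller held the lock.
+ */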
+
+/*
+ * WT_WITHOUT_SCHEMA_LOCK --
+ * Drop the schema lock, perform an operation, re-acquire the lock.
+ */
+#define WT_WITHOUT_SCHEMA_LOCK(session, op) do { \
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \
+ __wt_spin_unlock(session, &S2C(session)->schema_lock); \
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \
+ (op); \
+ __wt_spin_lock(session, &S2C(session)->schema_lock); \
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED); \
+ } else { \
+ (op); \
+ } \
+} while (0)
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
new file mode 100644
index 00000000000..70dc6b8764d
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -0,0 +1,329 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __page_write_gen_wrapped_check --
+ * Confirm the page's write generation number won't wrap.
+ */
+static inline int
+__page_write_gen_wrapped_check(WT_PAGE *page)
+{
+ return (page->modify->write_gen >
+ UINT32_MAX - WT_MILLION ? WT_RESTART : 0);
+}
+
+/*
+ * __insert_serial_func --
+ * Worker function to add a WT_INSERT entry to a skiplist.
+ */
+static inline int
+__insert_serial_func(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins,
+ u_int skipdepth)
+{
+ u_int i;
+
+ WT_UNUSED(session);
+
+ /*
+ * Confirm we are still in the expected position, and no item has been
+ * added where our insert belongs. Take extra care at the beginning
+ * and end of the list (at each level): retry if we race there.
+ *
+ * !!!
+ * Note the test for ins_stack[0] == NULL: that's the test for an
+ * uninitialized cursor, ins_stack[0] is cleared as part of
+ * initializing a cursor for a search.
+ */
+ for (i = 0; i < skipdepth; i++) {
+ if (ins_stack[i] == NULL ||
+ *ins_stack[i] != new_ins->next[i])
+ return (WT_RESTART);
+ if (new_ins->next[i] == NULL &&
+ ins_head->tail[i] != NULL &&
+ ins_stack[i] != &ins_head->tail[i]->next[i])
+ return (WT_RESTART);
+ }
+
+ /* Update the skiplist elements referencing the new WT_INSERT item. */
+ for (i = 0; i < skipdepth; i++) {
+ if (ins_head->tail[i] == NULL ||
+ ins_stack[i] == &ins_head->tail[i]->next[i])
+ ins_head->tail[i] = new_ins;
+ *ins_stack[i] = new_ins;
+ }
+
+ return (0);
+}
+
+/*
+ * __col_append_serial_func --
+ * Worker function to allocate a record number as necessary, then add a
+ * WT_INSERT entry to a skiplist.
+ */
+static inline int
+__col_append_serial_func(WT_SESSION_IMPL *session,
+ WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT *new_ins,
+ uint64_t *recnop, u_int skipdepth)
+{
+ WT_BTREE *btree;
+ uint64_t recno;
+ u_int i;
+
+ btree = S2BT(session);
+
+ /*
+ * If the application didn't specify a record number, allocate a new one
+ * and set up for an append.
+ */
+ if ((recno = WT_INSERT_RECNO(new_ins)) == 0) {
+ recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1;
+ WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL ||
+ recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head)));
+ for (i = 0; i < skipdepth; i++)
+ ins_stack[i] = ins_head->tail[i] == NULL ?
+ &ins_head->head[i] : &ins_head->tail[i]->next[i];
+ }
+
+ /* Confirm position and insert the new WT_INSERT item. */
+ WT_RET(__insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth));
+
+ /*
+ * Set the calling cursor's record number.
+ * If we extended the file, update the last record number.
+ */
+ *recnop = recno;
+ if (recno > btree->last_recno)
+ btree->last_recno = recno;
+
+ return (0);
+}
+
+/*
+ * __update_serial_func --
+ *	Worker function to add a WT_UPDATE entry to the page array.
+ */
+static inline int
+__update_serial_func(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_UPDATE **upd_entry, WT_UPDATE *upd)
+{
+ WT_DECL_RET;
+ WT_UPDATE *obsolete;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ /*
+ * Swap the update into place. If that fails, a new update was added
+ * after our search, we raced. Check if our update is still permitted,
+ * and if it is, do a full-barrier to ensure the update's next pointer
+ * is set before we update the linked list and try again.
+ */
+ while (!WT_ATOMIC_CAS8(*upd_entry, upd->next, upd)) {
+ WT_RET(__wt_txn_update_check(session, upd->next = *upd_entry));
+ WT_WRITE_BARRIER();
+ }
+
+ /*
+	 * If there are subsequent WT_UPDATE structures and we're evicting
+	 * pages, take the page-scanning lock and discard obsolete WT_UPDATE
+	 * structures.  Serialization is needed so only one thread does the
+ * obsolete check at a time, and to protect updates from disappearing
+ * under reconciliation.
+ */
+ if (upd->next != NULL &&
+ F_ISSET(S2C(session)->cache, WT_EVICT_ACTIVE)) {
+ F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
+ /* If we can't lock it, don't scan, that's okay. */
+ if (ret != 0)
+ return (0);
+ obsolete = __wt_update_obsolete_check(session, upd->next);
+ F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
+ if (obsolete != NULL)
+ __wt_update_obsolete_free(session, page, obsolete);
+ }
+ return (0);
+}
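+
+/*
+ * The swap loop above is the lock-free publish-then-CAS pattern.  A
+ * minimal generic sketch in C11 atomics (illustrative only, not the
+ * WiredTiger primitives):
+ *
+ *     new->next = atomic_load(&head);
+ *     while (!atomic_compare_exchange_weak(&head, &new->next, new))
+ *         ;
+ *
+ * On CAS failure, new->next is reloaded with the current head and the
+ * loop retries.  The new element's next pointer must be visible to other
+ * threads before the CAS publishes the element, which is what
+ * WT_WRITE_BARRIER ensures on platforms with weaker memory ordering.
+ */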
+
+/*
+ * DO NOT EDIT: automatically built by dist/serial.py.
+ * Serialization function section: BEGIN
+ */
+
+static inline int
+__wt_col_append_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size,
+ uint64_t *recnop, u_int skipdepth)
+{
+ WT_INSERT *new_ins = *new_insp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *new_insp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ /* Acquire the page's spinlock, call the worker function. */
+ WT_PAGE_LOCK(session, page);
+ ret = __col_append_serial_func(
+ session, ins_head, ins_stack, new_ins, recnop, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, new_ins);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, new_ins_size != 0);
+ incr_mem += new_ins_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+static inline int
+__wt_insert_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT **new_insp, size_t new_ins_size,
+ u_int skipdepth)
+{
+ WT_INSERT *new_ins = *new_insp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *new_insp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ /* Acquire the page's spinlock, call the worker function. */
+ WT_PAGE_LOCK(session, page);
+ ret = __insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, new_ins);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, new_ins_size != 0);
+ incr_mem += new_ins_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+static inline int
+__wt_update_serial(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE **srch_upd,
+ WT_UPDATE **updp, size_t upd_size)
+{
+ WT_UPDATE *upd = *updp;
+ WT_DECL_RET;
+ size_t incr_mem;
+
+ /* Clear references to memory we now own. */
+ *updp = NULL;
+
+ /*
+ * Check to see if the page's write generation is about to wrap (wildly
+ * unlikely as it implies 4B updates between clean page reconciliations,
+ * but technically possible), and fail the update.
+ *
+ * The check is outside of the serialization mutex because the page's
+ * write generation is going to be a hot cache line, so technically it's
+ * possible for the page's write generation to wrap between the test and
+ * our subsequent modification of it. However, the test is (4B-1M), and
+ * there cannot be a million threads that have done the test but not yet
+ * completed their modification.
+ */
+ WT_RET(__page_write_gen_wrapped_check(page));
+
+ ret = __update_serial_func(
+ session, page, srch_upd, upd);
+
+ /* Free unused memory on error. */
+ if (ret != 0) {
+ __wt_free(session, upd);
+
+ return (ret);
+ }
+
+ /*
+ * Increment in-memory footprint after releasing the mutex: that's safe
+ * because the structures we added cannot be discarded while visible to
+ * any running transaction, and we're a running transaction, which means
+ * there can be no corresponding delete until we complete.
+ */
+ incr_mem = 0;
+ WT_ASSERT(session, upd_size != 0);
+ incr_mem += upd_size;
+ if (incr_mem != 0)
+ __wt_cache_page_inmem_incr(session, page, incr_mem);
+
+ /* Mark the page dirty after updating the footprint. */
+ __wt_page_modify_set(session, page);
+
+ return (0);
+}
+
+/*
+ * Serialization function section: END
+ * DO NOT EDIT: automatically built by dist/serial.py.
+ */
diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h
new file mode 100644
index 00000000000..788ffe5eb45
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/session.h
@@ -0,0 +1,156 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * WT_DATA_HANDLE_CACHE --
+ * Per-session cache of handles to avoid synchronization when opening
+ * cursors.
+ */
+struct __wt_data_handle_cache {
+ WT_DATA_HANDLE *dhandle;
+
+ SLIST_ENTRY(__wt_data_handle_cache) l;
+};
+
+/*
+ * WT_HAZARD --
+ * A hazard pointer.
+ */
+struct __wt_hazard {
+ WT_PAGE *page; /* Page address */
+#ifdef HAVE_DIAGNOSTIC
+ const char *file; /* File/line where hazard acquired */
+ int line;
+#endif
+};
+
+/* Get the connection implementation for a session */
+#define S2C(session) ((WT_CONNECTION_IMPL *)(session)->iface.connection)
+#define S2C_SAFE(session) ((session) == NULL ? NULL : S2C(session))
+
+/* Get the btree for a session */
+#define S2BT(session) ((WT_BTREE *)(session)->dhandle->handle)
+#define S2BT_SAFE(session) ((session)->dhandle == NULL ? NULL : S2BT(session))
+
+/*
+ * WT_SESSION_IMPL --
+ * Implementation of WT_SESSION.
+ */
+struct __wt_session_impl {
+ WT_SESSION iface;
+
+ void *lang_private; /* Language specific private storage */
+
+ u_int active; /* Non-zero if the session is in-use */
+
+ const char *name; /* Name */
+ const char *lastop; /* Last operation */
+ uint32_t id; /* UID, offset in session array */
+
+ WT_CONDVAR *cond; /* Condition variable */
+
+ uint32_t rnd[2]; /* Random number generation state */
+
+ WT_EVENT_HANDLER *event_handler;/* Application's event handlers */
+
+ WT_DATA_HANDLE *dhandle; /* Current data handle */
+
+ /* Session handle reference list */
+ SLIST_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
+#define WT_DHANDLE_SWEEP_WAIT 60 /* Wait before discarding */
+#define WT_DHANDLE_SWEEP_PERIOD 20 /* Only sweep every 20 seconds */
+ time_t last_sweep; /* Last sweep for dead handles */
+
+ WT_CURSOR *cursor; /* Current cursor */
+ /* Cursors closed with the session */
+ TAILQ_HEAD(__cursors, __wt_cursor) cursors;
+
+ WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
+ WT_COMPACT *compact; /* Compact state */
+
+ WT_BTREE *metafile; /* Metadata file */
+ void *meta_track; /* Metadata operation tracking */
+ void *meta_track_next; /* Current position */
+ void *meta_track_sub; /* Child transaction / save point */
+ size_t meta_track_alloc; /* Currently allocated */
+ int meta_track_nest; /* Nesting level of meta transaction */
+#define WT_META_TRACKING(session) ((session)->meta_track_next != NULL)
+
+ TAILQ_HEAD(__tables, __wt_table) tables;
+
+ WT_ITEM **scratch; /* Temporary memory for any function */
+ u_int scratch_alloc; /* Currently allocated */
+#ifdef HAVE_DIAGNOSTIC
+ /*
+	 * It's hard to figure out where a buffer was allocated after it has
+	 * leaked, so in diagnostic mode we track allocations.  DIAGNOSTIC
+	 * can't simply add fields to the WT_ITEM structure because it is
+	 * visible to applications; we keep a parallel structure instead.
+ */
+ struct __wt_scratch_track {
+ const char *file; /* Allocating file, line */
+ int line;
+ } *scratch_track;
+#endif
+
+ WT_TXN_ISOLATION isolation;
+ WT_TXN txn; /* Transaction state */
+ u_int ncursors; /* Count of active file cursors. */
+
+ WT_REF **excl; /* Eviction exclusive list */
+ u_int excl_next; /* Next empty slot */
+ size_t excl_allocated; /* Bytes allocated */
+
+ void *block_manager; /* Block-manager support */
+ int (*block_manager_cleanup)(WT_SESSION_IMPL *);
+
+ WT_DATA_HANDLE **ckpt_handle; /* Checkpoint support */
+ u_int ckpt_handle_next; /* Next empty slot */
+ size_t ckpt_handle_allocated; /* Bytes allocated */
+
+ void *reconcile; /* Reconciliation support */
+ int (*reconcile_cleanup)(WT_SESSION_IMPL *);
+
+ int compaction; /* Compaction did some work */
+
+ /*
+ * The split stash memory and hazard information persist past session
+ * close, because they are accessed by threads of control other than
+ * the thread owning the session. They live at the end of the
+ * structure so it's somewhat easier to clear everything but the fields
+ * that persist.
+ */
+#define WT_SESSION_CLEAR_SIZE(s) \
+ (WT_PTRDIFF(&(s)->flags, s) + sizeof((s)->flags))
+ uint32_t flags;
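+
+	/*
+	 * For example, clearing a session for reuse might look like
+	 * memset(session, 0, WT_SESSION_CLEAR_SIZE(session)), leaving the
+	 * fields below intact (a sketch; the actual call site may differ).
+	 */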
+
+ /*
+ * Splits can "free" memory that may still be in use, and we use a
+ * split generation number to track it, that is, the session stores a
+ * reference to the memory and allocates a split generation; when no
+ * session is reading from that split generation, the memory can be
+ * freed for real.
+ */
+ struct __wt_split_stash {
+ uint64_t split_gen; /* Split generation */
+ void *p; /* Memory, length */
+ size_t len;
+ } *split_stash; /* Split stash array */
+ size_t split_stash_cnt; /* Array entries */
+ size_t split_stash_alloc; /* Allocated bytes */
+
+ uint64_t split_gen; /* Reading split generation */
+
+ /*
+ * Hazard pointers.
+ * The number of hazard pointers that can be in use grows dynamically.
+ */
+#define WT_HAZARD_INCR 10
+ uint32_t hazard_size; /* Allocated slots in hazard array. */
+ uint32_t nhazard; /* Count of active hazard pointers */
+ WT_HAZARD *hazard; /* Hazard pointer array */
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
new file mode 100644
index 00000000000..11f42ac5500
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -0,0 +1,332 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+struct __wt_stats {
+ const char *desc; /* text description */
+ uint64_t v; /* 64-bit value */
+};
+
+/*
+ * Read/write statistics without any test for statistics configuration.
+ */
+#define WT_STAT(stats, fld) \
+ ((stats)->fld.v)
+#define WT_STAT_ATOMIC_DECRV(stats, fld, value) do { \
+ (void)WT_ATOMIC_SUB8(WT_STAT(stats, fld), (value)); \
+} while (0)
+#define WT_STAT_ATOMIC_DECR(stats, fld) WT_STAT_ATOMIC_DECRV(stats, fld, 1)
+#define WT_STAT_ATOMIC_INCRV(stats, fld, value) do { \
+ (void)WT_ATOMIC_ADD8(WT_STAT(stats, fld), (value)); \
+} while (0)
+#define WT_STAT_ATOMIC_INCR(stats, fld) WT_STAT_ATOMIC_INCRV(stats, fld, 1)
+#define WT_STAT_DECRV(stats, fld, value) do { \
+ (stats)->fld.v -= (value); \
+} while (0)
+#define WT_STAT_DECR(stats, fld) WT_STAT_DECRV(stats, fld, 1)
+#define WT_STAT_INCRV(stats, fld, value) do { \
+ (stats)->fld.v += (value); \
+} while (0)
+#define WT_STAT_INCR(stats, fld) WT_STAT_INCRV(stats, fld, 1)
+#define WT_STAT_SET(stats, fld, value) do { \
+ (stats)->fld.v = (uint64_t)(value); \
+} while (0)
+
+/*
+ * Read/write statistics if "fast" statistics are configured.
+ */
+#define WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_ATOMIC_DECRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_ATOMIC_DECR(session, stats, fld) \
+ WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, 1)
+#define WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_ATOMIC_INCRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_ATOMIC_INCR(session, stats, fld) \
+ WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, 1)
+#define WT_STAT_FAST_DECRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_DECRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DECR(session, stats, fld) \
+ WT_STAT_FAST_DECRV(session, stats, fld, 1)
+#define WT_STAT_FAST_INCRV(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_INCRV(stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_INCR(session, stats, fld) \
+ WT_STAT_FAST_INCRV(session, stats, fld, 1)
+#define WT_STAT_FAST_SET(session, stats, fld, value) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ WT_STAT_SET(stats, fld, value); \
+} while (0)
+
+/*
+ * Read/write connection handle statistics if "fast" statistics are configured.
+ */
+#define WT_STAT_FAST_CONN_ATOMIC_DECRV(session, fld, value) \
+ WT_STAT_FAST_ATOMIC_DECRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_ATOMIC_DECR(session, fld) \
+ WT_STAT_FAST_ATOMIC_DECR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_ATOMIC_INCRV(session, fld, value) \
+ WT_STAT_FAST_ATOMIC_INCRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_ATOMIC_INCR(session, fld) \
+ WT_STAT_FAST_ATOMIC_INCR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_DECR(session, fld) \
+ WT_STAT_FAST_DECR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_DECRV(session, fld, value) \
+ WT_STAT_FAST_DECRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_INCR(session, fld) \
+ WT_STAT_FAST_INCR(session, &S2C(session)->stats, fld)
+#define WT_STAT_FAST_CONN_INCRV(session, fld, value) \
+ WT_STAT_FAST_INCRV(session, &S2C(session)->stats, fld, value)
+#define WT_STAT_FAST_CONN_SET(session, fld, value) \
+ WT_STAT_FAST_SET(session, &S2C(session)->stats, fld, value)
+
+/*
+ * Read/write data-source handle statistics if the data-source handle is set
+ * and "fast" statistics are configured.
+ *
+ * XXX
+ * We shouldn't have to check if the data-source handle is NULL, but it's
+ * useful until everything is converted to using data-source handles.
+ */
+#define WT_STAT_FAST_DATA_DECRV(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_DECRV( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DATA_DECR(session, fld) \
+ WT_STAT_FAST_DATA_DECRV(session, fld, 1)
+#define WT_STAT_FAST_DATA_INCRV(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_INCRV( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
+#define WT_STAT_FAST_DATA_INCR(session, fld) \
+ WT_STAT_FAST_DATA_INCRV(session, fld, 1)
+#define WT_STAT_FAST_DATA_SET(session, fld, value) do { \
+ if ((session)->dhandle != NULL) \
+ WT_STAT_FAST_SET( \
+ session, &(session)->dhandle->stats, fld, value); \
+} while (0)
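+
+/*
+ * Typical usage, for illustration (the statistics fields exist below; the
+ * surrounding context is hypothetical):
+ *
+ *     WT_STAT_FAST_CONN_INCR(session, cursor_create);
+ *     WT_STAT_FAST_DATA_INCRV(session, cursor_insert_bytes, size);
+ *
+ * The non-atomic forms compile to a flag test plus an unlocked 64-bit add,
+ * so concurrent updates can race and drop counts: "fast" statistics
+ * deliberately trade accuracy for minimal overhead on hot paths.
+ */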
+
+/*
+ * DO NOT EDIT: automatically built by dist/stat.py.
+ */
+/* Statistics section: BEGIN */
+
+/*
+ * Statistics entries for connections.
+ */
+#define WT_CONNECTION_STATS_BASE 1000
+struct __wt_connection_stats {
+ WT_STATS async_alloc_race;
+ WT_STATS async_alloc_view;
+ WT_STATS async_cur_queue;
+ WT_STATS async_flush;
+ WT_STATS async_full;
+ WT_STATS async_max_queue;
+ WT_STATS async_nowork;
+ WT_STATS async_op_alloc;
+ WT_STATS async_op_compact;
+ WT_STATS async_op_insert;
+ WT_STATS async_op_remove;
+ WT_STATS async_op_search;
+ WT_STATS async_op_update;
+ WT_STATS block_byte_map_read;
+ WT_STATS block_byte_read;
+ WT_STATS block_byte_write;
+ WT_STATS block_map_read;
+ WT_STATS block_preload;
+ WT_STATS block_read;
+ WT_STATS block_write;
+ WT_STATS cache_bytes_dirty;
+ WT_STATS cache_bytes_inuse;
+ WT_STATS cache_bytes_max;
+ WT_STATS cache_bytes_read;
+ WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
+ WT_STATS cache_eviction_clean;
+ WT_STATS cache_eviction_deepen;
+ WT_STATS cache_eviction_dirty;
+ WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_force;
+ WT_STATS cache_eviction_force_fail;
+ WT_STATS cache_eviction_hazard;
+ WT_STATS cache_eviction_internal;
+ WT_STATS cache_eviction_queue_empty;
+ WT_STATS cache_eviction_queue_not_empty;
+ WT_STATS cache_eviction_server_evicting;
+ WT_STATS cache_eviction_server_not_evicting;
+ WT_STATS cache_eviction_slow;
+ WT_STATS cache_eviction_split;
+ WT_STATS cache_eviction_walk;
+ WT_STATS cache_pages_dirty;
+ WT_STATS cache_pages_inuse;
+ WT_STATS cache_read;
+ WT_STATS cache_write;
+ WT_STATS cond_wait;
+ WT_STATS cursor_create;
+ WT_STATS cursor_insert;
+ WT_STATS cursor_next;
+ WT_STATS cursor_prev;
+ WT_STATS cursor_remove;
+ WT_STATS cursor_reset;
+ WT_STATS cursor_search;
+ WT_STATS cursor_search_near;
+ WT_STATS cursor_update;
+ WT_STATS dh_session_handles;
+ WT_STATS dh_session_sweeps;
+ WT_STATS file_open;
+ WT_STATS log_buffer_grow;
+ WT_STATS log_buffer_size;
+ WT_STATS log_bytes_user;
+ WT_STATS log_bytes_written;
+ WT_STATS log_close_yields;
+ WT_STATS log_max_filesize;
+ WT_STATS log_reads;
+ WT_STATS log_scan_records;
+ WT_STATS log_scan_rereads;
+ WT_STATS log_scans;
+ WT_STATS log_slot_closes;
+ WT_STATS log_slot_consolidated;
+ WT_STATS log_slot_joins;
+ WT_STATS log_slot_races;
+ WT_STATS log_slot_switch_fails;
+ WT_STATS log_slot_toobig;
+ WT_STATS log_slot_toosmall;
+ WT_STATS log_slot_transitions;
+ WT_STATS log_sync;
+ WT_STATS log_writes;
+ WT_STATS lsm_checkpoint_throttle;
+ WT_STATS lsm_merge_throttle;
+ WT_STATS lsm_rows_merged;
+ WT_STATS lsm_work_queue_app;
+ WT_STATS lsm_work_queue_manager;
+ WT_STATS lsm_work_queue_max;
+ WT_STATS lsm_work_queue_switch;
+ WT_STATS lsm_work_units_created;
+ WT_STATS lsm_work_units_discarded;
+ WT_STATS lsm_work_units_done;
+ WT_STATS memory_allocation;
+ WT_STATS memory_free;
+ WT_STATS memory_grow;
+ WT_STATS read_io;
+ WT_STATS rec_pages;
+ WT_STATS rec_pages_eviction;
+ WT_STATS rec_split_stashed_bytes;
+ WT_STATS rec_split_stashed_objects;
+ WT_STATS rwlock_read;
+ WT_STATS rwlock_write;
+ WT_STATS session_cursor_open;
+ WT_STATS session_open;
+ WT_STATS txn_begin;
+ WT_STATS txn_checkpoint;
+ WT_STATS txn_checkpoint_running;
+ WT_STATS txn_commit;
+ WT_STATS txn_fail_cache;
+ WT_STATS txn_pinned_range;
+ WT_STATS txn_rollback;
+ WT_STATS write_io;
+};
+
+/*
+ * Statistics entries for data sources.
+ */
+#define WT_DSRC_STATS_BASE 2000
+struct __wt_dsrc_stats {
+ WT_STATS allocation_size;
+ WT_STATS block_alloc;
+ WT_STATS block_checkpoint_size;
+ WT_STATS block_extension;
+ WT_STATS block_free;
+ WT_STATS block_magic;
+ WT_STATS block_major;
+ WT_STATS block_minor;
+ WT_STATS block_reuse_bytes;
+ WT_STATS block_size;
+ WT_STATS bloom_count;
+ WT_STATS bloom_false_positive;
+ WT_STATS bloom_hit;
+ WT_STATS bloom_miss;
+ WT_STATS bloom_page_evict;
+ WT_STATS bloom_page_read;
+ WT_STATS bloom_size;
+ WT_STATS btree_column_deleted;
+ WT_STATS btree_column_fix;
+ WT_STATS btree_column_internal;
+ WT_STATS btree_column_variable;
+ WT_STATS btree_compact_rewrite;
+ WT_STATS btree_entries;
+ WT_STATS btree_fixed_len;
+ WT_STATS btree_maximum_depth;
+ WT_STATS btree_maxintlitem;
+ WT_STATS btree_maxintlpage;
+ WT_STATS btree_maxleafitem;
+ WT_STATS btree_maxleafpage;
+ WT_STATS btree_overflow;
+ WT_STATS btree_row_internal;
+ WT_STATS btree_row_leaf;
+ WT_STATS cache_bytes_read;
+ WT_STATS cache_bytes_write;
+ WT_STATS cache_eviction_checkpoint;
+ WT_STATS cache_eviction_clean;
+ WT_STATS cache_eviction_dirty;
+ WT_STATS cache_eviction_fail;
+ WT_STATS cache_eviction_hazard;
+ WT_STATS cache_eviction_internal;
+ WT_STATS cache_overflow_value;
+ WT_STATS cache_read;
+ WT_STATS cache_read_overflow;
+ WT_STATS cache_write;
+ WT_STATS compress_raw_fail;
+ WT_STATS compress_raw_fail_temporary;
+ WT_STATS compress_raw_ok;
+ WT_STATS compress_read;
+ WT_STATS compress_write;
+ WT_STATS compress_write_fail;
+ WT_STATS compress_write_too_small;
+ WT_STATS cursor_create;
+ WT_STATS cursor_insert;
+ WT_STATS cursor_insert_bulk;
+ WT_STATS cursor_insert_bytes;
+ WT_STATS cursor_next;
+ WT_STATS cursor_prev;
+ WT_STATS cursor_remove;
+ WT_STATS cursor_remove_bytes;
+ WT_STATS cursor_reset;
+ WT_STATS cursor_search;
+ WT_STATS cursor_search_near;
+ WT_STATS cursor_update;
+ WT_STATS cursor_update_bytes;
+ WT_STATS lsm_checkpoint_throttle;
+ WT_STATS lsm_chunk_count;
+ WT_STATS lsm_generation_max;
+ WT_STATS lsm_lookup_no_bloom;
+ WT_STATS lsm_merge_throttle;
+ WT_STATS rec_dictionary;
+ WT_STATS rec_multiblock_internal;
+ WT_STATS rec_multiblock_leaf;
+ WT_STATS rec_multiblock_max;
+ WT_STATS rec_overflow_key_internal;
+ WT_STATS rec_overflow_key_leaf;
+ WT_STATS rec_overflow_value;
+ WT_STATS rec_page_delete;
+ WT_STATS rec_page_match;
+ WT_STATS rec_pages;
+ WT_STATS rec_pages_eviction;
+ WT_STATS rec_prefix_compression;
+ WT_STATS rec_suffix_compression;
+ WT_STATS session_compact;
+ WT_STATS session_cursor_open;
+ WT_STATS txn_update_conflict;
+};
+
+/* Statistics section: END */
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
new file mode 100644
index 00000000000..c28a9231750
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define WT_TXN_NONE 0 /* No txn running in a session. */
+#define WT_TXN_ABORTED UINT64_MAX /* Update rolled back, ignore. */
+
+/*
+ * Transaction ID comparison dealing with edge cases.
+ *
+ * WT_TXN_ABORTED is the largest possible ID (never visible to a running
+ * transaction), WT_TXN_NONE is smaller than any possible ID (visible to all
+ * running transactions).
+ */
+#define TXNID_LE(t1, t2) \
+ ((t1) <= (t2))
+
+#define TXNID_LT(t1, t2) \
+ ((t1) != (t2) && TXNID_LE(t1, t2))
+
+#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id])
+
+struct __wt_txn_state {
+ volatile uint64_t id;
+ volatile uint64_t snap_min;
+} WT_GCC_ATTRIBUTE((aligned(WT_CACHE_LINE_ALIGNMENT)));
+
+struct __wt_txn_global {
+ volatile uint64_t current; /* Current transaction ID. */
+
+ /* The oldest running transaction ID (may race). */
+ uint64_t last_running;
+
+ /*
+ * The oldest transaction ID that is not yet visible to some
+ * transaction in the system.
+ */
+ volatile uint64_t oldest_id;
+
+ /* The oldest session found in the last scan. */
+ uint32_t oldest_session;
+
+ /* Count of scanning threads, or -1 for exclusive access. */
+ volatile int32_t scan_count;
+
+ WT_TXN_STATE *states; /* Per-session transaction states */
+};
+
+typedef enum __wt_txn_isolation {
+ TXN_ISO_EVICTION, /* Internal: eviction context */
+ TXN_ISO_READ_UNCOMMITTED,
+ TXN_ISO_READ_COMMITTED,
+ TXN_ISO_SNAPSHOT
+} WT_TXN_ISOLATION;
+
+/*
+ * WT_TXN_OP --
+ * A transactional operation. Each transaction builds an in-memory array
+ * of these operations as it runs, then uses the array to either write log
+ * records during commit or undo the operations during rollback.
+ */
+struct __wt_txn_op {
+ uint32_t fileid;
+ enum {
+ TXN_OP_BASIC,
+ TXN_OP_INMEM,
+ TXN_OP_REF,
+ TXN_OP_TRUNCATE_COL,
+ TXN_OP_TRUNCATE_ROW
+ } type;
+ union {
+ /* TXN_OP_BASIC, TXN_OP_INMEM */
+ WT_UPDATE *upd;
+ /* TXN_OP_REF */
+ WT_REF *ref;
+ /* TXN_OP_TRUNCATE_COL */
+ struct {
+ uint64_t start, stop;
+ } truncate_col;
+ /* TXN_OP_TRUNCATE_ROW */
+ struct {
+ WT_ITEM start, stop;
+ enum {
+ TXN_TRUNC_ALL,
+ TXN_TRUNC_BOTH,
+ TXN_TRUNC_START,
+ TXN_TRUNC_STOP
+ } mode;
+ } truncate_row;
+ } u;
+};
+
+/*
+ * WT_TXN --
+ * Per-session transaction context.
+ */
+struct __wt_txn {
+ uint64_t id;
+
+ WT_TXN_ISOLATION isolation;
+
+ /*
+ * Snapshot data:
+ * ids < snap_min are visible,
+	 *	ids >= snap_max are invisible,
+ * everything else is visible unless it is in the snapshot.
+ */
+ uint64_t snap_min, snap_max;
+ uint64_t *snapshot;
+ uint32_t snapshot_count;
+ uint32_t txn_logsync; /* Log sync configuration */
+
+ /* Array of modifications by this transaction. */
+ WT_TXN_OP *mod;
+ size_t mod_alloc;
+ u_int mod_count;
+
+ /* Scratch buffer for in-memory log records. */
+ WT_ITEM *logrec;
+
+ /* Requested notification when transactions are resolved. */
+ WT_TXN_NOTIFY *notify;
+
+ /* Checkpoint status. */
+ WT_LSN ckpt_lsn;
+ int full_ckpt;
+ uint32_t ckpt_nsnapshot;
+ WT_ITEM *ckpt_snapshot;
+
+#define TXN_AUTOCOMMIT 0x01
+#define TXN_ERROR 0x02
+#define TXN_HAS_ID 0x04
+#define TXN_HAS_SNAPSHOT 0x08
+#define TXN_RUNNING 0x10
+ uint32_t flags;
+};
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
new file mode 100644
index 00000000000..127176c67ea
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -0,0 +1,382 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+static inline int __wt_txn_id_check(WT_SESSION_IMPL *session);
+static inline void __wt_txn_read_last(WT_SESSION_IMPL *session);
+
+/*
+ * __txn_next_op --
+ *	Allocate a slot for a new operation in the current transaction's
+ *	modification array.
+ */
+static inline int
+__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ *opp = NULL;
+
+ /*
+ * We're about to perform an update.
+ * Make sure we have allocated a transaction ID.
+ */
+ WT_RET(__wt_txn_id_check(session));
+ WT_ASSERT(session, F_ISSET(txn, TXN_HAS_ID));
+
+ WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
+ txn->mod_count + 1, &txn->mod));
+
+ *opp = &txn->mod[txn->mod_count++];
+ WT_CLEAR(**opp);
+ (*opp)->fileid = S2BT(session)->id;
+ return (0);
+}
+
+/*
+ * __wt_txn_unmodify --
+ * If threads race making updates, they may discard the last referenced
+ * WT_UPDATE item while the transaction is still active. This function
+ * removes the last update item from the "log".
+ */
+static inline void
+__wt_txn_unmodify(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ if (F_ISSET(txn, TXN_HAS_ID)) {
+ WT_ASSERT(session, txn->mod_count > 0);
+ txn->mod_count--;
+ }
+}
+
+/*
+ * __wt_txn_modify --
+ * Mark a WT_UPDATE object modified by the current transaction.
+ */
+static inline int
+__wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_DECL_RET;
+ WT_TXN_OP *op;
+
+ WT_RET(__txn_next_op(session, &op));
+ op->type = F_ISSET(session, WT_SESSION_LOGGING_INMEM) ?
+ TXN_OP_INMEM : TXN_OP_BASIC;
+ op->u.upd = upd;
+ upd->txnid = session->txn.id;
+ return (ret);
+}
+
+/*
+ * __wt_txn_modify_ref --
+ * Remember a WT_REF object modified by the current transaction.
+ */
+static inline int
+__wt_txn_modify_ref(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_TXN_OP *op;
+
+ WT_RET(__txn_next_op(session, &op));
+ op->type = TXN_OP_REF;
+ op->u.ref = ref;
+ return (__wt_txn_log_op(session, NULL));
+}
+
+/*
+ * __wt_txn_visible_all --
+ *	Check if a given transaction ID is "globally visible", that is,
+ *	whether all sessions in the system will see the transaction ID.
+ */
+static inline int
+__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
+{
+ uint64_t oldest_id;
+
+ oldest_id = S2C(session)->txn_global.oldest_id;
+ return (TXNID_LT(id, oldest_id));
+}
+
+/*
+ * __wt_txn_visible --
+ * Can the current transaction see the given ID?
+ */
+static inline int
+__wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ /*
+ * Eviction only sees globally visible updates, or if there is a
+ * checkpoint transaction running, use its transaction.
+ */
+ if (txn->isolation == TXN_ISO_EVICTION)
+ return (__wt_txn_visible_all(session, id));
+
+ /* Nobody sees the results of aborted transactions. */
+ if (id == WT_TXN_ABORTED)
+ return (0);
+
+ /* Changes with no associated transaction are always visible. */
+ if (id == WT_TXN_NONE)
+ return (1);
+
+ /*
+ * Read-uncommitted transactions see all other changes.
+ *
+ * All metadata reads are at read-uncommitted isolation. That's
+ * because once a schema-level operation completes, subsequent
+ * operations must see the current version of checkpoint metadata, or
+ * they may try to read blocks that may have been freed from a file.
+ * Metadata updates use non-transactional techniques (such as the
+ * schema and metadata locks) to protect access to in-flight updates.
+ */
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED ||
+ S2BT_SAFE(session) == session->metafile)
+ return (1);
+
+ /* Transactions see their own changes. */
+ if (id == txn->id)
+ return (1);
+
+ /*
+ * TXN_ISO_SNAPSHOT, TXN_ISO_READ_COMMITTED: the ID is visible if it is
+	 * not the result of a concurrent transaction, that is, if it was
+ * committed before the snapshot was taken.
+ *
+ * The order here is important: anything newer than the maximum ID we
+ * saw when taking the snapshot should be invisible, even if the
+ * snapshot is empty.
+ */
+ if (TXNID_LE(txn->snap_max, id))
+ return (0);
+ if (txn->snapshot_count == 0 || TXNID_LT(id, txn->snap_min))
+ return (1);
+
+ return (bsearch(&id, txn->snapshot, txn->snapshot_count,
+ sizeof(uint64_t), __wt_txnid_cmp) == NULL);
+}
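+
+/*
+ * The bsearch above requires the snapshot array to be sorted and uses a
+ * three-way comparator of transaction IDs.  A sketch of a compatible
+ * comparator (the real __wt_txnid_cmp is defined elsewhere in the tree):
+ *
+ *     static int
+ *     txnid_cmp(const void *v1, const void *v2)
+ *     {
+ *         uint64_t id1 = *(const uint64_t *)v1;
+ *         uint64_t id2 = *(const uint64_t *)v2;
+ *
+ *         return (id1 < id2 ? -1 : (id1 > id2 ? 1 : 0));
+ *     }
+ */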
+
+/*
+ * __wt_txn_read --
+ * Get the first visible update in a list (or NULL if none are visible).
+ */
+static inline WT_UPDATE *
+__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ while (upd != NULL && !__wt_txn_visible(session, upd->txnid))
+ upd = upd->next;
+
+ return (upd);
+}
+
+/*
+ * __wt_txn_autocommit_check --
+ * If an auto-commit transaction is required, start one.
+ */
+static inline int
+__wt_txn_autocommit_check(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ if (F_ISSET(txn, TXN_AUTOCOMMIT)) {
+ F_CLR(txn, TXN_AUTOCOMMIT);
+ return (__wt_txn_begin(session, NULL));
+ }
+ return (0);
+}
+
+/*
+ * __wt_txn_new_id --
+ * Allocate a new transaction ID.
+ */
+static inline uint64_t
+__wt_txn_new_id(WT_SESSION_IMPL *session)
+{
+ /*
+ * We want the global value to lead the allocated values, so that any
+ * allocated transaction ID eventually becomes globally visible. When
+ * there are no transactions running, the oldest_id will reach the
+ * global current ID, so we want post-increment semantics. Our atomic
+ * add primitive does pre-increment, so adjust the result here.
+ */
+ return (WT_ATOMIC_ADD8(S2C(session)->txn_global.current, 1) - 1);
+}
+
+/*
+ * __wt_txn_id_check --
+ *	A transaction is about to do an update: start an auto-commit
+ * transaction if required and allocate a transaction ID.
+ */
+static inline int
+__wt_txn_id_check(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+
+ WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
+ if (!F_ISSET(txn, TXN_HAS_ID)) {
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
+
+ /*
+ * Allocate a transaction ID.
+ *
+ * We use an atomic compare and swap to ensure that we get a
+ * unique ID that is published before the global counter is
+ * updated.
+ *
+ * If two threads race to allocate an ID, only the latest ID
+ * will proceed. The winning thread can be sure its snapshot
+ * contains all of the earlier active IDs. Threads that race
+ * and get an earlier ID may not appear in the snapshot, but
+ * they will loop and allocate a new ID before proceeding to
+ * make any updates.
+ *
+ * This potentially wastes transaction IDs when threads race to
+ * begin transactions: that is the price we pay to keep this
+ * path latch free.
+ */
+ do {
+ txn_state->id = txn->id = txn_global->current;
+ } while (!WT_ATOMIC_CAS8(
+ txn_global->current, txn->id, txn->id + 1));
+
+ /*
+ * If we have used 64-bits of transaction IDs, there is nothing
+ * more we can do.
+ */
+ if (txn->id == WT_TXN_ABORTED)
+ WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");
+ F_SET(txn, TXN_HAS_ID);
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_txn_update_check --
+ * Check if the current transaction can update an item.
+ */
+static inline int
+__wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ if (txn->isolation == TXN_ISO_SNAPSHOT)
+ while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) {
+ if (upd->txnid != WT_TXN_ABORTED) {
+ WT_STAT_FAST_DATA_INCR(
+ session, txn_update_conflict);
+ return (WT_ROLLBACK);
+ }
+ upd = upd->next;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_txn_read_last --
+ * Called when the last page for a session is released.
+ */
+static inline void
+__wt_txn_read_last(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ /* Release the snap_min ID we put in the global table. */
+ if (!F_ISSET(txn, TXN_RUNNING) ||
+ txn->isolation != TXN_ISO_SNAPSHOT)
+ __wt_txn_release_snapshot(session);
+}
+
+/*
+ * __wt_txn_cursor_op --
+ * Called for each cursor operation.
+ */
+static inline void
+__wt_txn_cursor_op(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ /*
+ * If there is no transaction running (so we don't have an ID), and no
+ * snapshot allocated, put an ID in the global table to prevent any
+ * update that we are reading from being trimmed to save memory. Do a
+ * read before the write because this shared data is accessed a lot.
+ *
+ * !!!
+ * Note: We are updating the global table unprotected, so the
+ * oldest_id may move past this ID if a scan races with this
+ * value being published. That said, read-uncommitted operations
+ * always take the most recent version of a value, so for that version
+ * to be freed, two newer versions would have to be committed. Putting
+ * this snap_min ID in the table prevents the oldest ID from moving
+ * further forward, so that once a read-uncommitted cursor is
+ * positioned on a value, it can't be freed.
+ */
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED &&
+ !F_ISSET(txn, TXN_HAS_ID) &&
+ TXNID_LT(txn_state->snap_min, txn_global->last_running))
+ txn_state->snap_min = txn_global->last_running;
+
+ if (txn->isolation != TXN_ISO_READ_UNCOMMITTED &&
+ !F_ISSET(txn, TXN_HAS_SNAPSHOT))
+ __wt_txn_refresh(session, 1);
+}
+
+/*
+ * __wt_txn_am_oldest --
+ * Am I the oldest transaction in the system?
+ */
+static inline int
+__wt_txn_am_oldest(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ uint64_t id;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+
+ if (txn->id == WT_TXN_NONE)
+ return (0);
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states;
+ i < session_cnt;
+ i++, s++)
+ if ((id = s->id) != WT_TXN_NONE &&
+ TXNID_LT(id, txn->id))
+ return (0);
+
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/include/verify_build.h b/src/third_party/wiredtiger/src/include/verify_build.h
new file mode 100644
index 00000000000..5f05db11c4b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/verify_build.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#undef ALIGN_CHECK
+#undef SIZE_CHECK
+
+/*
+ * NOTE: If you see a compile failure in this file, your compiler is laying out
+ * structs in memory in a way WiredTiger does not expect. Please refer to the
+ * build instructions in the documentation (docs/html/install.html) for more
+ * information.
+ */
+
+/*
+ * Compile time assertions.
+ *
+ * If the argument to WT_STATIC_ASSERT is zero, the macro evaluates to:
+ *
+ * (void)sizeof(char[-1])
+ *
+ * which fails to compile (which is what we want, the assertion failed).
+ * If the value of the argument to WT_STATIC_ASSERT is non-zero, then the
+ * macro evaluates to:
+ *
+ * (void)sizeof(char[1]);
+ *
+ * which compiles with no warnings, and produces no code.
+ *
+ * For more details about why this works, see
+ * http://scaryreasoner.wordpress.com/2009/02/28/
+ */
+#define WT_STATIC_ASSERT(cond) (void)sizeof(char[1 - 2 * !(cond)])
+
+#define SIZE_CHECK(type, e) do { \
+ char __check_##type[1 - 2 * !(sizeof(type) == (e))]; \
+ (void)__check_##type; \
+} while (0)
+
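+/*
+ * SIZE_CHECK declares a local array whose size is negative when the type's
+ * size differs from the expected value, forcing a compile-time error at the
+ * point of use; the (void) reference avoids unused-variable warnings.
+ */
+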
+#define ALIGN_CHECK(type, a) \
+ WT_STATIC_ASSERT(WT_ALIGN(sizeof(type), (a)) == sizeof(type))
+
+/*
+ * __wt_verify_build --
+ * This function is never called: it exists so there is a place for code
+ * that checks build-time conditions.
+ */
+static inline void
+__wt_verify_build(void)
+{
+ /* Check specific structures weren't padded. */
+ SIZE_CHECK(WT_BLOCK_DESC, WT_BLOCK_DESC_SIZE);
+ SIZE_CHECK(WT_REF, WT_REF_SIZE);
+
+ /*
+ * The btree code encodes key/value pairs in size_t's, and requires at
+ * least 8B size_t's.
+ */
+ WT_STATIC_ASSERT(sizeof(size_t) >= 8);
+
+ /*
+ * We require a wt_off_t fit into an 8B chunk because 8B is the largest
+ * integral value we can encode into an address cookie.
+ *
+ * WiredTiger has never been tested on a system with 4B file offsets;
+ * disallow them for now.
+ */
+ WT_STATIC_ASSERT(sizeof(wt_off_t) == 8);
+}
+
+#undef ALIGN_CHECK
+#undef SIZE_CHECK
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
new file mode 100644
index 00000000000..09cbca89f17
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -0,0 +1,3463 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#ifndef __WIREDTIGER_H_
+#define __WIREDTIGER_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************
+ * Version information
+ *******************************************/
+#define WIREDTIGER_VERSION_MAJOR @VERSION_MAJOR@
+#define WIREDTIGER_VERSION_MINOR @VERSION_MINOR@
+#define WIREDTIGER_VERSION_PATCH @VERSION_PATCH@
+#define WIREDTIGER_VERSION_STRING @VERSION_STRING@
+
+/*******************************************
+ * Required includes
+ *******************************************/
+@wiredtiger_includes_decl@
+
+/*******************************************
+ * Portable type names
+ *******************************************/
+@off_t_decl@
+@uintmax_t_decl@
+@uintptr_t_decl@
+
+#if defined(DOXYGEN) || defined(SWIG)
+#define __F(func) func
+#else
+#define __F(func) (*func)
+#endif
+
+#ifdef SWIG
+%{
+#include <wiredtiger.h>
+%}
+#endif
+
+/*!
+ * @defgroup wt WiredTiger API
+ * The functions, handles and methods applications use to access and manage
+ * data with WiredTiger.
+ *
+ * @{
+ */
+
+/*******************************************
+ * Public forward structure declarations
+ *******************************************/
+struct __wt_async_callback;
+ typedef struct __wt_async_callback WT_ASYNC_CALLBACK;
+struct __wt_async_op; typedef struct __wt_async_op WT_ASYNC_OP;
+struct __wt_collator; typedef struct __wt_collator WT_COLLATOR;
+struct __wt_compressor; typedef struct __wt_compressor WT_COMPRESSOR;
+struct __wt_config_item; typedef struct __wt_config_item WT_CONFIG_ITEM;
+struct __wt_config_parser;
+ typedef struct __wt_config_parser WT_CONFIG_PARSER;
+struct __wt_connection; typedef struct __wt_connection WT_CONNECTION;
+struct __wt_cursor; typedef struct __wt_cursor WT_CURSOR;
+struct __wt_data_source; typedef struct __wt_data_source WT_DATA_SOURCE;
+struct __wt_event_handler; typedef struct __wt_event_handler WT_EVENT_HANDLER;
+struct __wt_extension_api; typedef struct __wt_extension_api WT_EXTENSION_API;
+struct __wt_extractor; typedef struct __wt_extractor WT_EXTRACTOR;
+struct __wt_item; typedef struct __wt_item WT_ITEM;
+struct __wt_lsn; typedef struct __wt_lsn WT_LSN;
+struct __wt_session; typedef struct __wt_session WT_SESSION;
+
+#if defined(SWIGJAVA)
+#define WT_HANDLE_NULLABLE(typename) typename##_NULLABLE
+#define WT_HANDLE_CLOSED(typename) typename##_CLOSED
+typedef WT_CURSOR WT_CURSOR_NULLABLE;
+typedef WT_CURSOR WT_CURSOR_CLOSED;
+typedef WT_SESSION WT_SESSION_CLOSED;
+typedef WT_CONNECTION WT_CONNECTION_CLOSED;
+#elif !defined(DOXYGEN)
+#define WT_HANDLE_NULLABLE(typename) typename
+#define WT_HANDLE_CLOSED(typename) typename
+#endif
+
+/*!
+ * A raw item of data to be managed, including a pointer to the data and a
+ * length.
+ *
+ * WT_ITEM structures do not need to be cleared before use.
+ */
+struct __wt_item {
+ /*!
+ * The memory reference of the data item.
+ *
+ * For items returned by a WT_CURSOR, the pointer is only valid until
+ * the next operation on that cursor. Applications that need to keep
+ * an item across multiple cursor operations must make a copy.
+ */
+ const void *data;
+
+ /*!
+ * The number of bytes in the data item.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ */
+ size_t size;
+
+#ifndef DOXYGEN
+#define WT_ITEM_ALIGNED 0x00000001
+#define WT_ITEM_INUSE 0x00000002
+ /* This appears in the middle of the struct to avoid padding. */
+ /*! Object flags (internal use). */
+ uint32_t flags;
+
+ /*! Managed memory chunk (internal use). */
+ void *mem;
+ /*! Managed memory size (internal use). */
+ size_t memsize;
+#endif
+};
+
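+/*
+ * Example (illustrative sketch, not normative): passing raw data through
+ * WT_ITEM structures with a cursor whose key and value formats are "u"; the
+ * cursor handle is assumed for illustration only.
+ *
+ *	WT_ITEM key, value;
+ *	int ret;
+ *
+ *	key.data = "key";
+ *	key.size = strlen("key");
+ *	value.data = "value";
+ *	value.size = strlen("value");
+ *	cursor->set_key(cursor, &key);
+ *	cursor->set_value(cursor, &value);
+ *	ret = cursor->insert(cursor);
+ */
+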
+/*
+ * We rely on this structure being aligned at 64 bits by the compiler;
+ * if we were paranoid, we could add an unused field to ensure the
+ * padding is correct.
+ *
+ * NOTE: If you change the contents of this structure you must also update
+ * the macros in log.h.
+ */
+/*!
+ * A log sequence number, representing a position in the transaction log.
+ */
+struct __wt_lsn {
+ uint32_t file; /*!< Log file number */
+ wt_off_t offset; /*!< Log file offset */
+};
+
+/*!
+ * The maximum packed size of a 64-bit integer. The ::wiredtiger_struct_pack
+ * function will pack single long integers into at most this many bytes.
+ */
+#define WT_INTPACK64_MAXSIZE ((int)sizeof (int64_t) + 1)
+
+/*!
+ * The maximum packed size of a 32-bit integer. The ::wiredtiger_struct_pack
+ * function will pack single integers into at most this many bytes.
+ */
+#define WT_INTPACK32_MAXSIZE ((int)sizeof (int32_t) + 1)
+
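+/*
+ * Example (illustrative sketch, not normative): packing a single 64-bit
+ * integer into a buffer sized with WT_INTPACK64_MAXSIZE; the session handle
+ * is assumed for illustration only.
+ *
+ *	uint8_t buf[WT_INTPACK64_MAXSIZE];
+ *	int ret;
+ *
+ *	ret = wiredtiger_struct_pack(
+ *	    session, buf, sizeof(buf), "q", (int64_t)42);
+ */
+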
+/*!
+ * A WT_CURSOR handle is the interface to a cursor.
+ *
+ * Cursors allow data to be searched, iterated and modified, implementing the
+ * CRUD (create, read, update and delete) operations. Cursors are opened in
+ * the context of a session. If a transaction is started, cursors operate in
+ * the context of the transaction until the transaction is resolved.
+ *
+ * Raw data is represented by key/value pairs of WT_ITEM structures, but
+ * cursors can also provide access to fields within the key and value if the
+ * formats are described in the WT_SESSION::create method.
+ *
+ * In the common case, a cursor is used to access records in a table. However,
+ * cursors can be used on subsets of tables (such as a single column or a
+ * projection of multiple columns), as an interface to statistics, configuration
+ * data or application-specific data sources. See WT_SESSION::open_cursor for
+ * more information.
+ *
+ * <b>Thread safety:</b> A WT_CURSOR handle is not usually shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_cursor {
+ WT_SESSION *session; /*!< The session handle for this cursor. */
+
+ /*!
+ * The name of the data source for the cursor, matches the \c uri
+ * parameter to WT_SESSION::open_cursor used to open the cursor.
+ */
+ const char *uri;
+
+ /*!
+ * The format of the data packed into key items. See @ref packing for
+ * details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *key_format;
+
+ /*!
+ * The format of the data packed into value items. See @ref packing
+ * for details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *value_format;
+
+ /*!
+ * @name Data access
+ * @{
+ */
+ /*!
+ * Get the key for the current record.
+ *
+ * @snippet ex_all.c Get the cursor's string key
+ *
+ * @snippet ex_all.c Get the cursor's record number key
+ *
+ * @param cursor the cursor handle
+ * @param ... pointers to hold key fields corresponding to
+ * WT_CURSOR::key_format.
+ * @errors
+ */
+ int __F(get_key)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Get the value for the current record.
+ *
+ * @snippet ex_all.c Get the cursor's string value
+ *
+ * @snippet ex_all.c Get the cursor's raw value
+ *
+ * @param cursor the cursor handle
+ * @param ... pointers to hold value fields corresponding to
+ * WT_CURSOR::value_format.
+ * @errors
+ */
+ int __F(get_value)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Set the key for the next operation.
+ *
+ * @snippet ex_all.c Set the cursor's string key
+ *
+ * @snippet ex_all.c Set the cursor's record number key
+ *
+ * @param cursor the cursor handle
+ * @param ... key fields corresponding to WT_CURSOR::key_format.
+ *
+ * If an error occurs during this operation, a flag will be set in the
+ * cursor, and the next operation to access the key will fail. This
+ * simplifies error handling in applications.
+ */
+ void __F(set_key)(WT_CURSOR *cursor, ...);
+
+ /*!
+ * Set the value for the next operation.
+ *
+ * @snippet ex_all.c Set the cursor's string value
+ *
+ * @snippet ex_all.c Set the cursor's raw value
+ *
+ * @param cursor the cursor handle
+ * @param ... value fields corresponding to WT_CURSOR::value_format.
+ *
+ * If an error occurs during this operation, a flag will be set in the
+ * cursor, and the next operation to access the value will fail. This
+ * simplifies error handling in applications.
+ */
+ void __F(set_value)(WT_CURSOR *cursor, ...);
+ /*! @} */
+
+ /*!
+ * @name Cursor positioning
+ * @{
+ */
+ /*!
+ * Return the ordering relationship between two cursors: both cursors
+ * must have the same data source and have valid keys.
+ *
+ * @snippet ex_all.c Cursor comparison
+ *
+ * @param cursor the cursor handle
+ * @param other another cursor handle
+ * @param comparep the status of the comparison: < 0 if
+ * <code>cursor</code> refers to a key that appears before
+ * <code>other</code>, 0 if the cursors refer to the same key,
+ * and > 0 if <code>cursor</code> refers to a key that appears after
+ * <code>other</code>.
+ * @errors
+ */
+ int __F(compare)(WT_CURSOR *cursor, WT_CURSOR *other, int *comparep);
+
+ /*!
+ * Return the next record.
+ *
+ * @snippet ex_all.c Return the next record
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(next)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the previous record.
+ *
+ * @snippet ex_all.c Return the previous record
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(prev)(WT_CURSOR *cursor);
+
+ /*!
+ * Reset the position of the cursor. Any resources held by the cursor
+ * are released, and the cursor's key and position are no longer valid.
+ * A subsequent iteration with WT_CURSOR::next will move to the first
+ * record, or with WT_CURSOR::prev will move to the last record.
+ *
+ * @snippet ex_all.c Reset the cursor
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(reset)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the record matching the key. The key must first be set.
+ *
+ * @snippet ex_all.c Search for an exact match
+ *
+ * On success, the cursor ends positioned at the returned record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the record has been retrieved and the cursor no
+ * longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(search)(WT_CURSOR *cursor);
+
+ /*!
+ * Return the record matching the key if it exists, or an adjacent
+ * record. An adjacent record is either the smallest record larger
+ * than the key or the largest record smaller than the key (in other
+ * words, a logically adjacent key).
+ *
+ * The key must first be set.
+ *
+ * An example of a search for an exact or adjacent match:
+ *
+ * @snippet ex_all.c Search for an exact or adjacent match
+ *
+ * An example of a forward scan through the table, where all keys
+ * greater than or equal to a specified prefix are included in the
+ * scan:
+ *
+ * @snippet ex_all.c Forward scan greater than or equal
+ *
+ * An example of a backward scan through the table, where all keys
+ * less than a specified prefix are included in the scan:
+ *
+ * @snippet ex_all.c Backward scan less than
+ *
+ * On success, the cursor ends positioned at the returned record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the record has been retrieved and the cursor no
+ * longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @param exactp the status of the search: 0 if an exact match is
+ * found, < 0 if a smaller key is returned, > 0 if a larger key is
+ * returned
+ * @errors
+ */
+ int __F(search_near)(WT_CURSOR *cursor, int *exactp);
+ /*! @} */
+
+ /*!
+ * @name Data modification
+ * @{
+ */
+ /*!
+ * Insert a record and optionally update an existing record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * both the key and value must be set; if the record already exists,
+ * the key's value will be updated, otherwise, the record will be
+ * inserted.
+ *
+ * @snippet ex_all.c Insert a new record or overwrite an existing record
+ *
+ * If the cursor was not configured with "overwrite=true", both the key
+ * and value must be set and the record must not already exist; the
+ * record will be inserted.
+ *
+ * @snippet ex_all.c Insert a new record and fail if the record exists
+ *
+ * If a cursor with record number keys was configured with
+ * "append=true" (not the default), the value must be set; a new record
+ * will be appended and the record number set as the cursor key value.
+ *
+ * @snippet ex_all.c Insert a new record and assign a record number
+ *
+ * The cursor ends with no position, and a subsequent call to the
+ * WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the
+ * beginning (end) of the table.
+ *
+ * Inserting a new record after the current maximum record in a
+ * fixed-length bit field column-store (that is, a store with an
+ * 'r' type key and 't' type value) may implicitly create the missing
+ * records as records with a value of 0.
+ *
+ * When loading a large amount of data into a new object, using
+ * a cursor with the \c bulk configuration string enabled and
+ * loading the data in sorted order will be much faster than doing
+ * out-of-order inserts. See @ref tune_bulk_load for more information.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and a record with
+ * the specified key already exists, ::WT_DUPLICATE_KEY is returned.
+ */
+ int __F(insert)(WT_CURSOR *cursor);
+
+ /*!
+ * Update an existing record and optionally insert a record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * both the key and value must be set; if the record already exists, the
+ * key's value will be updated, otherwise, the record will be inserted.
+ *
+ * @snippet ex_all.c Update an existing record or insert a new record
+ *
+ * If the cursor was not configured with "overwrite=true", both the key
+ * and value must be set and the record must already exist; the
+ * record will be updated.
+ *
+ * @snippet ex_all.c Update an existing record and fail if DNE
+ *
+ * On success, the cursor ends positioned at the modified record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the cursor no longer needs that position.
+ *
+ * The maximum length of a single column stored in a table is not fixed
+ * (as it partially depends on the underlying file configuration), but
+ * is always a small number of bytes less than 4GB.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and no record with
+ * the specified key exists, ::WT_NOTFOUND is returned.
+ */
+ int __F(update)(WT_CURSOR *cursor);
+
+ /*!
+ * Remove a record.
+ *
+ * If the cursor was configured with "overwrite=true" (the default),
+ * the key must be set; the key's record will be removed if it exists,
+ * and no error will be returned if the record does not exist.
+ *
+ * @snippet ex_all.c Remove a record
+ *
+ * If the cursor was not configured with "overwrite=true", the key must
+ * be set and the key's record must exist; the record will be removed.
+ *
+ * @snippet ex_all.c Remove a record and fail if DNE
+ *
+ * Removing a record in a fixed-length bit field column-store
+ * (that is, a store with an 'r' type key and 't' type value) is
+ * identical to setting the record's value to 0.
+ *
+ * On success, the cursor ends positioned at the removed record; to
+ * minimize cursor resources, the WT_CURSOR::reset method should be
+ * called as soon as the cursor no longer needs that position.
+ *
+ * @param cursor the cursor handle
+ * @errors
+ * In particular, if \c overwrite is not configured and no record with
+ * the specified key exists, ::WT_NOTFOUND is returned.
+ */
+ int __F(remove)(WT_CURSOR *cursor);
+ /*! @} */
+
+ /*!
+ * Close the cursor.
+ *
+ * This releases the resources associated with the cursor handle.
+ * Cursors are closed implicitly by ending the enclosing connection or
+ * closing the session in which they were opened.
+ *
+ * @snippet ex_all.c Close the cursor
+ *
+ * @param cursor the cursor handle
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_CURSOR) *cursor);
+
+ /*
+ * Protected fields, only to be used by cursor implementations.
+ */
+#if !defined(SWIG) && !defined(DOXYGEN)
+ /*
+ * !!!
+ * Explicit representations of structures from queue.h.
+ * TAILQ_ENTRY(wt_cursor) q;
+ */
+ struct {
+ WT_CURSOR *tqe_next;
+ WT_CURSOR **tqe_prev;
+ } q; /* Linked list of WT_CURSORs. */
+
+ uint64_t recno; /* Record number, normal and raw mode */
+ uint8_t raw_recno_buf[WT_INTPACK64_MAXSIZE];
+
+ void *json_private; /* JSON specific storage */
+ void *lang_private; /* Language specific private storage */
+
+ WT_ITEM key, value;
+ int saved_err; /* Saved error in set_{key,value}. */
+ /*
+ * URI used internally, may differ from the URI provided by the
+ * user on open.
+ */
+ const char *internal_uri;
+
+#define WT_CURSTD_APPEND 0x0001
+#define WT_CURSTD_BULK 0x0002
+#define WT_CURSTD_DATA_SOURCE 0x0004
+#define WT_CURSTD_DUMP_HEX 0x0008
+#define WT_CURSTD_DUMP_JSON 0x0010
+#define WT_CURSTD_DUMP_PRINT 0x0020
+#define WT_CURSTD_KEY_EXT 0x0040 /* Key points out of the tree. */
+#define WT_CURSTD_KEY_INT 0x0080 /* Key points into the tree. */
+#define WT_CURSTD_KEY_SET (WT_CURSTD_KEY_EXT | WT_CURSTD_KEY_INT)
+#define WT_CURSTD_OPEN 0x0100
+#define WT_CURSTD_OVERWRITE 0x0200
+#define WT_CURSTD_RAW 0x0400
+#define WT_CURSTD_VALUE_EXT 0x0800 /* Value points out of the tree. */
+#define WT_CURSTD_VALUE_INT 0x1000 /* Value points into the tree. */
+#define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT)
+ uint32_t flags;
+#endif
+};
+
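+/*
+ * Example (illustrative sketch, not normative): a search followed by an
+ * insert if the record is missing, using string key and value formats; the
+ * "table:example" URI is assumed for illustration only.
+ *
+ *	WT_CURSOR *cursor;
+ *	const char *value;
+ *	int ret;
+ *
+ *	ret = session->open_cursor(
+ *	    session, "table:example", NULL, NULL, &cursor);
+ *	cursor->set_key(cursor, "key");
+ *	if ((ret = cursor->search(cursor)) == 0)
+ *		ret = cursor->get_value(cursor, &value);
+ *	else if (ret == WT_NOTFOUND) {
+ *		cursor->set_key(cursor, "key");
+ *		cursor->set_value(cursor, "value");
+ *		ret = cursor->insert(cursor);
+ *	}
+ *	ret = cursor->close(cursor);
+ */
+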
+/*! Asynchronous operation types. */
+typedef enum {
+ WT_AOP_NONE=0, /*!< No operation type set */
+ WT_AOP_COMPACT, /*!< WT_ASYNC_OP::compact */
+ WT_AOP_INSERT, /*!< WT_ASYNC_OP::insert */
+ WT_AOP_REMOVE, /*!< WT_ASYNC_OP::remove */
+ WT_AOP_SEARCH, /*!< WT_ASYNC_OP::search */
+ WT_AOP_UPDATE /*!< WT_ASYNC_OP::update */
+} WT_ASYNC_OPTYPE;
+
+/*!
+ * A WT_ASYNC_OP handle is the interface to an asynchronous operation.
+ *
+ * An asynchronous operation describes a data manipulation to be performed
+ * asynchronously by a WiredTiger worker thread. These operations implement
+ * the CRUD (create, read, update and delete) operations. Each operation
+ * is a self-contained work unit. The operation will be performed in the
+ * context of the worker thread's session. Each operation is performed
+ * within the context of a transaction. The application is notified of its
+ * completion with a callback. The transaction is resolved once the callback
+ * returns.
+ *
+ * The table referenced in an operation must already exist.
+ *
+ * Raw data is represented by key/value pairs of WT_ITEM structures, but
+ * operations can also provide access to fields within the key and value if
+ * the formats are described in the WT_SESSION::create method.
+ *
+ * <b>Thread safety:</b> A WT_ASYNC_OP handle may not be shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_async_op {
+ /*! The connection for this operation. */
+ WT_CONNECTION *connection;
+
+ /*!
+ * The format of the data packed into key items. See @ref packing for
+ * details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *key_format;
+
+ /*!
+ * The format of the data packed into value items. See @ref packing
+ * for details. If not set, a default value of "u" is assumed, and
+ * applications must use WT_ITEM structures to manipulate untyped byte
+ * arrays.
+ */
+ const char *value_format;
+
+ /*
+ * Don't expose app_private to non-C language bindings - they have
+ * their own way to attach data to an operation.
+ */
+#if !defined(SWIG)
+ /*!
+ * A location for applications to store information that will be
+ * available in the callback from an async operation.
+ */
+ void *app_private;
+#endif
+
+ /*!
+ * @name Data access
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::get_key method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns as described for WT_CURSOR::get_key
+ */
+ int __F(get_key)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::get_value method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns as described for WT_CURSOR::get_value
+ */
+ int __F(get_value)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::set_key method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ */
+ void __F(set_key)(WT_ASYNC_OP *op, ...);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::set_value method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ */
+ void __F(set_value)(WT_ASYNC_OP *op, ...);
+ /*! @} */
+
+ /*!
+ * @name Positioning
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::search method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::search
+ */
+ int __F(search)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * @name Data modification
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_CURSOR::insert method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::insert
+ */
+ int __F(insert)(WT_ASYNC_OP *op);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::update method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::update
+ */
+ int __F(update)(WT_ASYNC_OP *op);
+
+ /*!
+ * Invoke the underlying WT_CURSOR::remove method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_CURSOR::remove
+ */
+ int __F(remove)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * @name Table operations
+ * @{
+ */
+ /*!
+ * Invoke the underlying WT_SESSION::compact method; see that method
+ * for configuration, return and error values.
+ *
+ * @param op the operation handle
+ * @returns via the callback as described for WT_SESSION::compact
+ */
+ int __F(compact)(WT_ASYNC_OP *op);
+ /*! @} */
+
+ /*!
+ * Get the unique identifier for this operation.
+ *
+ * @snippet ex_async.c async get identifier
+ *
+ * @param op the operation handle
+ * @returns the id of the operation
+ */
+ uint64_t __F(get_id)(WT_ASYNC_OP *op);
+
+ /*!
+ * Get the type for this operation.
+ *
+ * @snippet ex_async.c async get type
+ *
+ * @param op the operation handle
+ * @returns the ::WT_ASYNC_OPTYPE of the operation
+ */
+ WT_ASYNC_OPTYPE __F(get_type)(WT_ASYNC_OP *op);
+
+ /*
+ * Protected fields, only to be used by internal implementation.
+ * Everything we need for maintaining the key/value is part of
+ * a cursor. So, include one here so that we can use the cursor
+ * functions to manage them.
+ */
+#if !defined(SWIG) && !defined(DOXYGEN)
+ WT_CURSOR c;
+#endif
+};
+
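+/*
+ * Example (illustrative sketch, not normative): queueing one asynchronous
+ * insert.  The notify callback signature is assumed to match the
+ * WT_ASYNC_CALLBACK declaration later in this file, and the "table:example"
+ * URI is for illustration only.
+ *
+ *	static int
+ *	notify(WT_ASYNC_CALLBACK *cb,
+ *	    WT_ASYNC_OP *op, int op_ret, uint32_t flags)
+ *	{
+ *		return (0);
+ *	}
+ *	static WT_ASYNC_CALLBACK callback = { notify };
+ *
+ *	WT_ASYNC_OP *op;
+ *	int ret;
+ *
+ *	ret = conn->async_new_op(
+ *	    conn, "table:example", NULL, &callback, &op);
+ *	op->set_key(op, "key");
+ *	op->set_value(op, "value");
+ *	ret = op->insert(op);
+ */
+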
+/*!
+ * All data operations are performed in the context of a WT_SESSION. This
+ * encapsulates the thread and transactional context of the operation.
+ *
+ * <b>Thread safety:</b> A WT_SESSION handle is not usually shared between
+ * threads, see @ref threads for more information.
+ */
+struct __wt_session {
+ /*! The connection for this session. */
+ WT_CONNECTION *connection;
+
+ /*!
+ * Close the session handle.
+ *
+ * This will release the resources associated with the session handle,
+ * including rolling back any active transactions and closing any
+ * cursors that remain open in the session.
+ *
+ * @snippet ex_all.c Close a session
+ *
+ * @param session the session handle
+ * @configempty{session.close, see dist/api_data.py}
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_SESSION) *session,
+ const char *config);
+
+ /*!
+ * Reconfigure a session handle.
+ *
+ * @snippet ex_all.c Reconfigure a session
+ *
+ * WT_SESSION::reconfigure will fail if a transaction is in progress
+ * in the session.
+ *
+ * All cursors are reset.
+ *
+ * @param session the session handle
+ * @configstart{session.reconfigure, see dist/api_data.py}
+ * @config{isolation, the default isolation level for operations in this
+ * session., a string\, chosen from the following options: \c
+ * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c
+ * read-committed.}
+ * @configend
+ * @errors
+ */
+ int __F(reconfigure)(WT_SESSION *session, const char *config);
+
+ /*!
+ * @name Cursor handles
+ * @{
+ */
+
+ /*!
+ * Open a new cursor on a data source or duplicate an existing cursor.
+ *
+ * @snippet ex_all.c Open a cursor
+ *
+ * An existing cursor can be duplicated by passing it as the \c to_dup
+ * parameter and setting the \c uri parameter to \c NULL:
+ *
+ * @snippet ex_all.c Duplicate a cursor
+ *
+ * Cursors being duplicated must have a key set, and successfully
+ * duplicated cursors are positioned at the same place in the data
+ * source as the original.
+ *
+ * To reconfigure a cursor, duplicate it with a new configuration value:
+ *
+ * @snippet ex_all.c Reconfigure a cursor
+ *
+ * Cursor handles should be discarded by calling WT_CURSOR::close.
+ *
+ * Cursors capable of supporting transactional operations operate in the
+ * context of the current transaction, if any.
+ *
+ * WT_SESSION::rollback_transaction implicitly resets all cursors.
+ *
+ * Cursors are relatively light-weight objects but may hold references
+ * to heavier-weight objects; applications should re-use cursors when
+ * possible, but instantiating new cursors is not so expensive that
+ * applications need to cache cursors at all costs.
+ *
+ * @param session the session handle
+ * @param uri the data source on which the cursor operates; cursors
+ * are usually opened on tables, however, cursors can be opened on
+ * any data source, regardless of whether it is ultimately stored
+ * in a table. Some cursor types may have limited functionality
+ * (for example, they may be read-only or not support transactional
+ * updates). See @ref data_sources for more information.
+ * <br>
+ * @copydoc doc_cursor_types
+ * @param to_dup a cursor to duplicate
+ * @configstart{session.open_cursor, see dist/api_data.py}
+ * @config{append, append the value as a new record\, creating a new
+ * record number key; valid only for cursors with record number keys., a
+ * boolean flag; default \c false.}
+ * @config{bulk, configure the cursor for bulk-loading\, a fast\,
+ * initial load path (see @ref tune_bulk_load for more information).
+ * Bulk-load may only be used for newly created objects and cursors
+ * configured for bulk-load only support the WT_CURSOR::insert and
+ * WT_CURSOR::close methods. When bulk-loading row-store objects\, keys
+ * must be loaded in sorted order. The value is usually a true/false
+ * flag; when bulk-loading fixed-length column store objects\, the
+ * special value \c bitmap allows chunks of a memory resident bitmap to
+ * be loaded directly into a file by passing a \c WT_ITEM to
+ * WT_CURSOR::set_value where the \c size field indicates the number of
+ * records in the bitmap (as specified by the object's \c value_format
+ * configuration). Bulk-loaded bitmap values must end on a byte boundary
+ * relative to the bit count (except for the last set of values
+ * loaded)., a string; default \c false.}
+ * @config{checkpoint, the name of a checkpoint to open (the reserved
+ * name "WiredTigerCheckpoint" opens the most recent internal checkpoint
+ * taken for the object). The cursor does not support data
+ * modification., a string; default empty.}
+ * @config{dump, configure the cursor for dump format inputs and
+ * outputs: "hex" selects a simple hexadecimal format\, "json" selects a
+ * JSON format with each record formatted as fields named by column
+ * names if available\, and "print" selects a format where only
+ * non-printing characters are hexadecimal encoded. These formats are
+ * compatible with the @ref util_dump and @ref util_load commands., a
+ * string\, chosen from the following options: \c "hex"\, \c "json"\, \c
+ * "print"; default empty.}
+ * @config{next_random, configure the cursor to return a pseudo-random
+ * record from the object; valid only for row-store cursors. Cursors
+ * configured with \c next_random=true only support the WT_CURSOR::next
+ * and WT_CURSOR::close methods. See @ref cursor_random for details., a
+ * boolean flag; default \c false.}
+ * @config{overwrite, configures whether the cursor's insert\, update
+ * and remove methods check the existing state of the record. If \c
+ * overwrite is \c false\, WT_CURSOR::insert fails with
+ * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and
+ * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not
+ * exist., a boolean flag; default \c true.}
+ * @config{raw, ignore the encodings for the key and value\, manage data
+ * as if the formats were \c "u". See @ref cursor_raw for details., a
+ * boolean flag; default \c false.}
+ * @config{readonly, only query operations are supported by this cursor.
+ * An error is returned if a modification is attempted using the cursor.
+ * The default is false for all cursor types except for log and metadata
+ * cursors., a boolean flag; default \c false.}
+ * @config{statistics, Specify the statistics to be gathered. Choosing
+ * "all" gathers statistics regardless of cost and may include
+ * traversing on-disk files; "fast" gathers a subset of relatively
+ * inexpensive statistics. The selection must agree with the database
+ * \c statistics configuration specified to ::wiredtiger_open or
+ * WT_CONNECTION::reconfigure. For example\, "all" or "fast" can be
+ * configured when the database is configured with "all"\, but the
+ * cursor open will fail if "all" is specified when the database is
+ * configured with "fast"\, and the cursor open will fail in all cases
+ * when the database is configured with "none". If \c statistics is not
+ * configured\, the default configuration is the database configuration.
+ * The "clear" configuration resets statistics after gathering them\,
+ * where appropriate (for example\, a cache size statistic is not
+ * cleared\, while the count of cursor insert operations will be
+ * cleared). See @ref statistics for more information., a list\, with
+ * values chosen from the following options: \c "all"\, \c "fast"\, \c
+ * "clear"; default empty.}
+ * @config{target, if non-empty\, backup the list of objects; valid only
+ * for a backup data source., a list of strings; default empty.}
+ * @configend
+ * @param[out] cursorp a pointer to the newly opened cursor
+ * @errors
+ */
+ int __F(open_cursor)(WT_SESSION *session,
+ const char *uri, WT_HANDLE_NULLABLE(WT_CURSOR) *to_dup,
+ const char *config, WT_CURSOR **cursorp);
+ /*! @} */
+
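+	/*
+	 * Example (illustrative sketch, not normative): duplicating an
+	 * open, positioned cursor by passing it as to_dup with a NULL
+	 * uri; the original cursor handle is assumed for illustration.
+	 *
+	 *	WT_CURSOR *dup;
+	 *	int ret;
+	 *
+	 *	ret = session->open_cursor(
+	 *	    session, NULL, cursor, NULL, &dup);
+	 */
+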
+ /*!
+ * @name Table operations
+ * @{
+ */
+ /*!
+ * Create a table, column group, index or file.
+ *
+ * @snippet ex_all.c Create a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to create, such as
+ * \c "table:stock". For a description of URI formats
+ * see @ref data_sources.
+ * @configstart{session.create, see dist/api_data.py}
+ * @config{allocation_size, the file unit allocation size\, in bytes\,
+ * must be a power-of-two; smaller values decrease the file space required
+ * by overflow items\, and the default value of 4KB is a good choice
+ * absent requirements from the operating system or storage device., an
+ * integer between 512B and 128MB; default \c 4KB.}
+ * @config{app_metadata, application-owned metadata for this object., a
+ * string; default empty.}
+ * @config{block_allocation, configure block allocation. Permitted
+ * values are \c "first" or \c "best"; the \c "first" configuration uses
+ * a first-available algorithm during block allocation\, the \c "best"
+ * configuration uses a best-fit algorithm., a string\, chosen from the
+ * following options: \c "first"\, \c "best"; default \c best.}
+ * @config{block_compressor, configure a compressor for file blocks.
+ * Permitted values are empty (off) or \c "bzip2"\, \c "snappy" or
+ * custom compression engine \c "name" created with
+ * WT_CONNECTION::add_compressor. See @ref compression for more
+ * information., a string; default empty.}
+ * @config{cache_resident, do not ever evict the object's pages; see
+ * @ref tuning_cache_resident for more information., a boolean flag;
+ * default \c false.}
+ * @config{checksum, configure block checksums; permitted values are
+ * <code>on</code> (checksum all blocks)\, <code>off</code> (checksum no
+ * blocks) and <code>uncompressed</code> (checksum only blocks which
+ * are not compressed for any reason). The \c uncompressed setting is
+ * for applications which can rely on decompression to fail if a block
+ * has been corrupted., a string\, chosen from the following options: \c
+ * "on"\, \c "off"\, \c "uncompressed"; default \c uncompressed.}
+ * @config{colgroups, comma-separated list of names of column groups.
+ * Each column group is stored separately\, keyed by the primary key of
+ * the table. If no column groups are specified\, all columns are
+ * stored together in a single file. All value columns in the table
+ * must appear in at least one column group. Each column group must be
+ * created with a separate call to WT_SESSION::create., a list of
+ * strings; default empty.}
+ * @config{collator, configure custom collation for keys. Value must be
+ * a collator name created with WT_CONNECTION::add_collator., a string;
+ * default empty.}
+ * @config{columns, list of the column names. Comma-separated list of
+ * the form <code>(column[\,...])</code>. For tables\, the number of
+ * entries must match the total number of values in \c key_format and \c
+ * value_format. For colgroups and indices\, all column names must
+ * appear in the list of columns for the table., a list of strings;
+ * default empty.}
+ * @config{dictionary, the maximum number of unique values remembered in
+ * the Btree row-store leaf page value dictionary; see @ref
+ * file_formats_compression for more information., an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{exclusive, fail if the object exists. When false (the
+ * default)\, if the object exists\, check that its settings match the
+ * specified configuration., a boolean flag; default \c false.}
+ * @config{format, the file format., a string\, chosen from the
+ * following options: \c "btree"; default \c btree.}
+ * @config{huffman_key, configure Huffman encoding for keys. Permitted
+ * values are empty (off)\, \c "english"\, \c "utf8<file>" or \c
+ * "utf16<file>". See @ref huffman for more information., a string;
+ * default empty.}
+ * @config{huffman_value, configure Huffman encoding for values.
+ * Permitted values are empty (off)\, \c "english"\, \c "utf8<file>" or
+ * \c "utf16<file>". See @ref huffman for more information., a string;
+ * default empty.}
+ * @config{internal_item_max, the largest key stored within an internal
+ * node\, in bytes. If non-zero\, any key larger than the specified
+ * size will be stored as an overflow item (which may require additional
+ * I/O to access). If zero\, a default size is chosen that permits at
+ * least 8 keys per internal page., an integer greater than or equal to
+ * 0; default \c 0.}
+ * @config{internal_key_truncate, configure internal key truncation\,
+ * discarding unnecessary trailing bytes on internal keys (ignored for
+ * custom collators)., a boolean flag; default \c true.}
+ * @config{internal_page_max, the maximum page size for internal nodes\,
+ * in bytes; the size must be a multiple of the allocation size and is
+ * significant for applications wanting to avoid excessive L2 cache
+ * misses while searching the tree. The page maximum is the bytes of
+ * uncompressed data\, that is\, the limit is applied before any block
+ * compression is done., an integer between 512B and 512MB; default \c
+ * 4KB.}
+ * @config{key_format, the format of the data packed into key items.
+ * See @ref schema_format_types for details. By default\, the
+ * key_format is \c 'u' and applications use WT_ITEM structures to
+ * manipulate raw byte arrays. By default\, records are stored in
+ * row-store files: keys of type \c 'r' are record numbers and records
+ * referenced by record number are stored in column-store files., a
+ * format string; default \c u.}
+ * @config{leaf_item_max, the largest key or value stored within a leaf
+ * node\, in bytes. If non-zero\, any key or value larger than the
+ * specified size will be stored as an overflow item (which may require
+ * additional I/O to access). If zero\, a default size is chosen that
+ * permits at least 4 key and value pairs per leaf page., an integer
+ * greater than or equal to 0; default \c 0.}
+ * @config{leaf_page_max, the maximum page size for leaf nodes\, in
+ * bytes; the size must be a multiple of the allocation size\, and is
+ * significant for applications wanting to maximize sequential data
+ * transfer from a storage device. The page maximum is the bytes of
+ * uncompressed data\, that is\, the limit is applied before any block
+ * compression is done., an integer between 512B and 512MB; default \c
+ * 32KB.}
+ * @config{lsm = (, options only relevant for LSM data sources., a set
+ * of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;auto_throttle, Throttle inserts into
+ * LSM trees if flushing to disk isn't keeping up., a boolean flag;
+ * default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom, create bloom
+ * filters on LSM tree chunks as they are merged., a boolean flag;
+ * default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_bit_count,
+ * the number of bits used per item for LSM bloom filters., an integer
+ * between 2 and 1000; default \c 16.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_config, config string used when
+ * creating Bloom filter files\, passed to WT_SESSION::create., a
+ * string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_hash_count, the number of hash
+ * values per item used for LSM bloom filters., an integer between 2 and
+ * 100; default \c 8.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;bloom_oldest,
+ * create a bloom filter on the oldest LSM tree chunk. Only supported
+ * if bloom filters are enabled., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_max, the maximum size a single
+ * chunk can be. Chunks larger than this size are not considered for
+ * further merges. This is a soft limit\, and chunks larger than this
+ * value can be created. Must be larger than chunk_size., an integer
+ * between 100MB and 10TB; default \c 5GB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_size, the maximum size of the
+ * in-memory chunk of an LSM tree. This limit is soft - it is possible
+ * for chunks to be temporarily larger than this value. This overrides
+ * the \c memory_page_max setting., an integer between 512K and 500MB;
+ * default \c 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_max, the
+ * maximum number of chunks to include in a merge operation., an integer
+ * between 2 and 100; default \c 15.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_min, the minimum number of
+ * chunks to include in a merge operation. If set to 0 or 1 half the
+ * value of merge_max is used., an integer no more than 100; default \c
+ * 0.}
+ * @config{ ),,}
+ * @config{memory_page_max, the maximum size a page can grow to in
+ * memory before being reconciled to disk. The specified size will be
+ * adjusted to a lower bound of <code>50 * leaf_page_max</code>\, and an
+ * upper bound of <code>cache_size / 2</code>. This limit is soft - it
+ * is possible for pages to be temporarily larger than this value. This
+ * setting is ignored for LSM trees\, see \c chunk_size., an integer
+ * between 512B and 10TB; default \c 5MB.}
+ * @config{os_cache_dirty_max, maximum dirty system buffer cache usage\,
+ * in bytes. If non-zero\, schedule writes for dirty blocks belonging
+ * to this object in the system buffer cache after that many bytes from
+ * this object are written into the buffer cache., an integer greater
+ * than or equal to 0; default \c 0.}
+ * @config{os_cache_max, maximum system buffer cache usage\, in bytes.
+ * If non-zero\, evict object blocks from the system buffer cache after
+ * that many bytes from this object are read or written into the buffer
+ * cache., an integer greater than or equal to 0; default \c 0.}
+ * @config{prefix_compression, configure prefix compression on row-store
+ * leaf pages., a boolean flag; default \c false.}
+ * @config{prefix_compression_min, minimum gain before prefix
+ * compression will be used on row-store leaf pages., an integer greater
+ * than or equal to 0; default \c 4.}
+ * @config{split_pct, the Btree page split size as a percentage of the
+ * maximum Btree page size\, that is\, when a Btree page is split\, it
+ * will be split into smaller pages\, where each page is the specified
+ * percentage of the maximum Btree page size., an integer between 25 and
+ * 100; default \c 75.}
+ * @config{type, set the type of data source used to store a column
+ * group\, index or simple table. By default\, a \c "file:" URI is
+ * derived from the object name. The \c type configuration can be used
+ * to switch to a different data source\, such as LSM or an extension
+ * configured by the application., a string; default \c file.}
+ * @config{value_format, the format of the data packed into value items.
+ * See @ref schema_format_types for details. By default\, the
+ * value_format is \c 'u' and applications use a WT_ITEM structure to
+ * manipulate raw byte arrays. Value items of type 't' are bitfields\,
+ * and when configured with record number type keys\, will be stored
+ * using a fixed-length store., a format string; default \c u.}
+ * @configend
+ * @errors
+ */
+ int __F(create)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Compact a live row- or column-store btree or LSM tree.
+ *
+ * @snippet ex_all.c Compact a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to compact, such as
+ * \c "table:stock"
+ * @configstart{session.compact, see dist/api_data.py}
+ * @config{timeout, maximum amount of time to allow for compact in
+ * seconds. The actual amount of time spent in compact may exceed the
+ * configured value. A value of zero disables the timeout., an integer;
+ * default \c 1200.}
+ * @configend
+ * @errors
+ */
+ int __F(compact)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Drop (delete) an object.
+ *
+ * @snippet ex_all.c Drop a table
+ *
+ * @param session the session handle
+ * @param name the URI of the object to drop, such as \c "table:stock"
+ * @configstart{session.drop, see dist/api_data.py}
+ * @config{force, return success if the object does not exist., a
+ * boolean flag; default \c false.}
+ * @config{remove_files, should the underlying files be removed?., a
+ * boolean flag; default \c true.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(drop)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Insert a ::WT_LOGREC_MESSAGE type record in the database log files
+ * (the database must be configured for logging when this method is
+ * called).
+ *
+ * @param session the session handle
+ * @param fmt a printf format specifier
+ * @errors
+ */
+ int __F(log_printf)(WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Rename an object.
+ *
+ * @snippet ex_all.c Rename a table
+ *
+ * @param session the session handle
+ * @param uri the current URI of the object, such as \c "table:old"
+ * @param newuri the new URI of the object, such as \c "table:new"
+ * @configempty{session.rename, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(rename)(WT_SESSION *session,
+ const char *uri, const char *newuri, const char *config);
+
+ /*!
+ * Salvage a file or table.
+ *
+ * Salvage rebuilds the file, or files of which a table is comprised,
+ * discarding any corrupted file blocks.
+ *
+ * Previously deleted records may re-appear, and inserted records may
+ * disappear, when salvage is done, so salvage should not be run
+ * unless it is known to be necessary. Normally, salvage should be
+ * called after a file or table has been corrupted, as reported by the
+ * WT_SESSION::verify method.
+ *
+ * Files are rebuilt in place; the salvage method overwrites the
+ * existing files.
+ *
+ * @snippet ex_all.c Salvage a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to salvage
+ * @configstart{session.salvage, see dist/api_data.py}
+ * @config{force, force salvage even of files that do not appear to be
+ * WiredTiger files., a boolean flag; default \c false.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(salvage)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Truncate a file, table or cursor range.
+ *
+ * Truncate a file or table.
+ * @snippet ex_all.c Truncate a table
+ *
+ * Truncate a cursor range. When truncating based on a cursor position,
+ * it is not required that the cursor reference a record in the object,
+ * only that the key be set. This allows applications to discard portions
+ * of the object name space without knowing exactly what records the
+ * object contains.
+ * @snippet ex_all.c Truncate a range
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to truncate
+ * @param start optional cursor marking the first record discarded;
+ * if <code>NULL</code>, the truncate starts from the beginning of
+ * the object
+ * @param stop optional cursor marking the last record discarded;
+ * if <code>NULL</code>, the truncate continues to the end of the
+ * object
+ * @configempty{session.truncate, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(truncate)(WT_SESSION *session,
+ const char *name,
+ WT_HANDLE_NULLABLE(WT_CURSOR) *start,
+ WT_HANDLE_NULLABLE(WT_CURSOR) *stop,
+ const char *config);
+
+ /*!
+ * Upgrade a file or table.
+ *
+ * The upgrade method upgrades a file or table, if an upgrade is required.
+ *
+ * @snippet ex_all.c Upgrade a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to upgrade
+ * @configempty{session.upgrade, see dist/api_data.py}
+ * @ebusy_errors
+ */
+ int __F(upgrade)(WT_SESSION *session,
+ const char *name, const char *config);
+
+ /*!
+ * Verify a file or table.
+ *
+ * Verify reports whether a file, or the files of which a table is
+ * comprised, have been corrupted. The WT_SESSION::salvage method
+ * can be used to repair a corrupted file.
+ *
+ * @snippet ex_all.c Verify a table
+ *
+ * @param session the session handle
+ * @param name the URI of the file or table to verify
+ * @configstart{session.verify, see dist/api_data.py}
+ * @config{dump_address, Display addresses and page types as pages are
+ * verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_blocks, Display the contents of on-disk blocks as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @config{dump_offsets, Display the contents of specific on-disk
+ * blocks\, using the application's message handler\, intended for
+ * debugging., a list of strings; default empty.}
+ * @config{dump_pages, Display the contents of in-memory pages as they
+ * are verified\, using the application's message handler\, intended for
+ * debugging., a boolean flag; default \c false.}
+ * @configend
+ * @ebusy_errors
+ */
+ int __F(verify)(WT_SESSION *session,
+ const char *name, const char *config);
+ /*! @} */
+
+ /*!
+ * @name Transactions
+ * @{
+ */
+ /*!
+ * Start a transaction in this session.
+ *
+ * The transaction remains active until ended by
+ * WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction.
+ * Operations performed on cursors capable of supporting transactional
+ * operations that are already open in this session, or which are opened
+ * before the transaction ends, will operate in the context of the
+ * transaction.
+ *
+ * WT_SESSION::begin_transaction will fail if a transaction is already
+ * in progress in the session.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configstart{session.begin_transaction, see dist/api_data.py}
+ * @config{isolation, the isolation level for this transaction; defaults
+ * to the session's isolation level., a string\, chosen from the
+ * following options: \c "read-uncommitted"\, \c "read-committed"\, \c
+ * "snapshot"; default empty.}
+ * @config{name, name of the transaction for tracing and debugging., a
+ * string; default empty.}
+ * @config{priority, priority of the transaction for resolving
+ * conflicts. Transactions with higher values are less likely to
+ * abort., an integer between -100 and 100; default \c 0.}
+ * @config{sync, whether to sync log records when the transaction
+ * commits\, inherited from ::wiredtiger_open \c transaction_sync., a
+ * boolean flag; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(begin_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Commit the current transaction.
+ *
+ * A transaction must be in progress when this method is called.
+ *
+ * If WT_SESSION::commit_transaction returns an error, the transaction
+ * was rolled back, not committed.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configempty{session.commit_transaction, see dist/api_data.py}
+ * @errors
+ */
+ int __F(commit_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Roll back the current transaction.
+ *
+ * A transaction must be in progress when this method is called.
+ *
+ * All cursors are reset.
+ *
+ * @snippet ex_all.c transaction commit/rollback
+ *
+ * @param session the session handle
+ * @configempty{session.rollback_transaction, see dist/api_data.py}
+ * @errors
+ */
+ int __F(rollback_transaction)(WT_SESSION *session, const char *config);
+
+ /*!
+ * Write a transactionally consistent snapshot of a database or set of
+ * objects. The checkpoint includes all transactions committed before
+ * the checkpoint starts. Existing checkpoints may optionally be
+ * discarded.
+ *
+ * @snippet ex_all.c Checkpoint examples
+ *
+ * @param session the session handle
+ * @configstart{session.checkpoint, see dist/api_data.py}
+ * @config{drop, specify a list of checkpoints to drop. The list may
+ * additionally contain one of the following keys: \c "from=all" to drop
+ * all checkpoints\, \c "from=<checkpoint>" to drop all checkpoints
+ * after and including the named checkpoint\, or \c "to=<checkpoint>" to
+ * drop all checkpoints before and including the named checkpoint.
+ * Checkpoints cannot be dropped while a hot backup is in progress or if
+ * open in a cursor., a list of strings; default empty.}
+ * @config{force, by default\, checkpoints may be skipped if the
+ * underlying object has not been modified\, this option forces the
+ * checkpoint., a boolean flag; default \c false.}
+ * @config{name, if non-empty\, specify a name for the checkpoint (note
+ * that checkpoints including LSM trees may not be named)., a string;
+ * default empty.}
+ * @config{target, if non-empty\, checkpoint the list of objects., a
+ * list of strings; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(checkpoint)(WT_SESSION *session, const char *config);
+
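+	/*
+	 * Example (illustrative sketch, not normative): a minimal
+	 * transaction followed by an unnamed checkpoint; error handling
+	 * is elided and the cursor is assumed to be open in this session
+	 * with string key and value formats.
+	 *
+	 *	ret = session->begin_transaction(session, NULL);
+	 *	cursor->set_key(cursor, "key");
+	 *	cursor->set_value(cursor, "value");
+	 *	ret = cursor->insert(cursor);
+	 *	ret = session->commit_transaction(session, NULL);
+	 *	ret = session->checkpoint(session, NULL);
+	 */
+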
+ /*!
+ * Return the transaction ID range pinned by the session handle.
+ *
+ * The ID range is approximate and is calculated based on the oldest
+ * ID needed for the active transaction in this session, compared
+ * to the newest transaction in the system.
+ *
+ * @snippet ex_all.c transaction pinned range
+ *
+ * @param session the session handle
+ * @param[out] range the range of IDs pinned by this session. Zero if
+ * there is no active transaction.
+ * @errors
+ */
+ int __F(transaction_pinned_range)(WT_SESSION* session, uint64_t *range);
+
+ /*! @} */
+};
+
+/*!
+ * A connection to a WiredTiger database. The connection may be opened within
+ * the same address space as the caller or accessed over a socket connection.
+ *
+ * Most applications will open a single connection to a database for each
+ * process. The first process to open a connection to a database will access
+ * the database in its own address space. Subsequent connections (if allowed)
+ * will communicate with the first process over a socket connection to perform
+ * their operations.
+ *
+ * <b>Thread safety:</b> A WT_CONNECTION handle may be shared between threads,
+ * see @ref threads for more information.
+ */
+struct __wt_connection {
+ /*!
+ * @name Async operation handles
+ * @{
+ */
+ /*!
+ * Wait for all outstanding operations to complete.
+ *
+ * @snippet ex_async.c async flush
+ *
+ * @param connection the connection handle
+ * @errors
+ */
+ int __F(async_flush)(WT_CONNECTION *connection);
+
+ /*!
+ * Return an async operation handle.
+ *
+ * @snippet ex_async.c async handle allocation
+ *
+ * @param connection the connection handle
+ * @param uri the data source on which the operation operates
+ * @configstart{connection.async_new_op, see dist/api_data.py}
+ * @config{append, append the value as a new record\, creating a new
+ * record number key; valid only for operations with record number
+ * keys., a boolean flag; default \c false.}
+ * @config{overwrite, configures whether the cursor's insert\, update
+ * and remove methods check the existing state of the record. If \c
+ * overwrite is \c false\, WT_CURSOR::insert fails with
+ * ::WT_DUPLICATE_KEY if the record exists\, WT_CURSOR::update and
+ * WT_CURSOR::remove fail with ::WT_NOTFOUND if the record does not
+ * exist., a boolean flag; default \c true.}
+ * @config{raw, ignore the encodings for the key and value\, manage data
+ * as if the formats were \c "u". See @ref cursor_raw for details., a
+ * boolean flag; default \c false.}
+ * @config{timeout, maximum amount of time to allow for compact in
+ * seconds. The actual amount of time spent in compact may exceed the
+ * configured value. A value of zero disables the timeout., an integer;
+ * default \c 1200.}
+ * @configend
+ * @param callback the operation callback
+ * @param[out] asyncopp the new op handle
+ * @errors
+ * If there are no available handles, \c EBUSY is returned.
+ */
+ int __F(async_new_op)(WT_CONNECTION *connection,
+ const char *uri, const char *config, WT_ASYNC_CALLBACK *callback,
+ WT_ASYNC_OP **asyncopp);
+ /*! @} */
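+
+	/*
+	 * A hedged sketch of allocating an async operation handle; the URI
+	 * and the callback structure (my_callback) are illustrative
+	 * assumptions. If no handles are available, EBUSY is returned and
+	 * the call can be retried, for example after
+	 * WT_CONNECTION::async_flush:
+	 *
+	 *	WT_ASYNC_OP *op;
+	 *	int ret;
+	 *
+	 *	while ((ret = conn->async_new_op(
+	 *	    conn, "table:access", NULL, &my_callback, &op)) == EBUSY)
+	 *		(void)conn->async_flush(conn);
+	 */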
+
+ /*!
+ * Close a connection.
+ *
+ * Any open sessions will be closed.
+ *
+ * @snippet ex_all.c Close a connection
+ *
+ * @param connection the connection handle
+ * @configstart{connection.close, see dist/api_data.py}
+ * @config{leak_memory, don't free memory during close., a boolean flag;
+ * default \c false.}
+ * @configend
+ * @errors
+ */
+ int __F(close)(WT_HANDLE_CLOSED(WT_CONNECTION) *connection,
+ const char *config);
+
+ /*!
+ * Reconfigure a connection handle.
+ *
+ * @snippet ex_all.c Reconfigure a connection
+ *
+ * @param connection the connection handle
+ * @configstart{connection.reconfigure, see dist/api_data.py}
+ * @config{async = (, asynchronous operations configuration options., a
+ * set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable asynchronous
+ * operation., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;ops_max, maximum number of expected
+ * simultaneous asynchronous operations., an integer between 10 and
+ * 4096; default \c 1024.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads, the
+ * number of worker threads to service asynchronous requests., an
+ * integer between 1 and 20; default \c 2.}
+ * @config{ ),,}
+ * @config{cache_size, maximum heap memory to allocate for the cache. A
+ * database should configure either a cache_size or a shared_cache not
+ * both., an integer between 1MB and 10TB; default \c 100MB.}
+ * @config{checkpoint = (, periodically checkpoint the database., a set
+ * of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;log_size, wait for this amount of log
+ * record bytes to be written to the log between each checkpoint. A
+ * database can configure both log_size and wait to set an upper bound
+ * for checkpoints; setting this value above 0 configures periodic
+ * checkpoints., an integer between 0 and 2GB; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the checkpoint name., a string;
+ * default \c "WiredTigerCheckpoint".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * checkpoint; setting this value above 0 configures periodic
+ * checkpoints., an integer between 0 and 100000; default \c 0.}
+ * @config{ ),,}
+ * @config{error_prefix, prefix string for error messages., a string;
+ * default empty.}
+ * @config{eviction = (, eviction configuration options., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_max, maximum number of
+ * threads WiredTiger will start to help evict pages from cache. The
+ * number of threads started will vary depending on the current eviction
+ * load., an integer between 1 and 20; default \c 1.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_min, minimum number of
+ * threads WiredTiger will start to help evict pages from cache. The
+ * number of threads currently running will vary depending on the
+ * current eviction load., an integer between 1 and 20; default \c 1.}
+ * @config{ ),,}
+ * @config{eviction_dirty_target, continue evicting until the cache has
+ * less dirty memory than the value\, as a percentage of the total cache
+ * size. Dirty pages will only be evicted if the cache is full enough
+ * to trigger eviction., an integer between 10 and 99; default \c 80.}
+ * @config{eviction_target, continue evicting until the cache has less
+ * total memory than the value\, as a percentage of the total cache
+ * size. Must be less than \c eviction_trigger., an integer between 10
+ * and 99; default \c 80.}
+ * @config{eviction_trigger, trigger eviction when the cache is using
+ * this much memory\, as a percentage of the total cache size., an
+ * integer between 10 and 99; default \c 95.}
+ * @config{lsm_manager = (, configure database wide options for LSM tree
+ * management., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge, merge LSM chunks where
+ * possible., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;worker_thread_max, Configure a set of
+ * threads to manage merging LSM trees in the database., an integer
+ * between 3 and 20; default \c 4.}
+ * @config{ ),,}
+ * @config{shared_cache = (, shared cache configuration options. A
+ * database should configure either a cache_size or a shared_cache not
+ * both., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk, the granularity that a shared
+ * cache is redistributed., an integer between 1MB and 10TB; default \c
+ * 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a cache that is
+ * shared between databases., a string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this
+ * database is guaranteed to have available from the shared cache. This
+ * setting is per database. Defaults to the chunk size., an integer;
+ * default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory
+ * to allocate for the shared cache. Setting this will update the value
+ * if one is already set., an integer between 1MB and 10TB; default \c
+ * 500MB.}
+ * @config{ ),,}
+ * @config{statistics, Maintain database statistics\, which may impact
+ * performance. Choosing "all" maintains all statistics regardless of
+ * cost\, "fast" maintains a subset of statistics that are relatively
+ * inexpensive\, "none" turns off all statistics. The "clear"
+ * configuration resets statistics after they are gathered\, where
+ * appropriate (for example\, a cache size statistic is not cleared\,
+ * while the count of cursor insert operations will be cleared). When
+ * "clear" is configured for the database\, gathered statistics are
+ * reset each time a statistics cursor is used to gather statistics\, as
+ * well as each time statistics are logged using the \c statistics_log
+ * configuration. See @ref statistics for more information., a list\,
+ * with values chosen from the following options: \c "all"\, \c "fast"\,
+ * \c "none"\, \c "clear"; default \c none.}
+ * @config{statistics_log = (, log any statistics the database is
+ * configured to maintain\, to a file. See @ref statistics for more
+ * information., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;on_close, log statistics on database
+ * close., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the pathname to a file into
+ * which the log records are written\, may contain ISO C standard
+ * strftime conversion specifications. If the value is not an absolute
+ * path name\, the file is created relative to the database home., a
+ * string; default \c "WiredTigerStat.%d.%H".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;sources, if non-empty\, include
+ * statistics for the list of data source URIs\, if they are open at the
+ * time of the statistics logging. The list may include URIs matching a
+ * single data source ("table:mytable")\, or a URI matching all data
+ * sources of a particular type ("table:")., a list of strings; default
+ * empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;timestamp, a timestamp
+ * prepended to each log record\, may contain strftime conversion
+ * specifications., a string; default \c "%b %d %H:%M:%S".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * write of the log records., an integer between 0 and 100000; default
+ * \c 0.}
+ * @config{ ),,}
+ * @config{verbose, enable messages for various events. Only available
+ * if WiredTiger is configured with --enable-verbose. Options are given
+ * as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a
+ * list\, with values chosen from the following options: \c "api"\, \c
+ * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c
+ * "evictserver"\, \c "fileops"\, \c "log"\, \c "lsm"\, \c "metadata"\,
+ * \c "mutex"\, \c "overflow"\, \c "read"\, \c "reconcile"\, \c
+ * "recovery"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c
+ * "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\, \c
+ * "write"; default empty.}
+ * @configend
+ * @errors
+ */
+ int __F(reconfigure)(WT_CONNECTION *connection, const char *config);
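+
+	/*
+	 * A minimal sketch of reconfiguring a running connection, assuming an
+	 * open connection handle named conn; the new cache size is an
+	 * arbitrary illustrative value:
+	 *
+	 *	int ret = conn->reconfigure(conn, "cache_size=500MB");
+	 */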
+
+ /*!
+ * The home directory of the connection.
+ *
+ * @snippet ex_all.c Get the database home directory
+ *
+ * @param connection the connection handle
+ * @returns a pointer to a string naming the home directory
+ */
+ const char *__F(get_home)(WT_CONNECTION *connection);
+
+ /*!
+ * Add configuration options for a method. See
+ * @ref custom_ds_config_add for more information.
+ *
+ * @snippet ex_all.c Configure method configuration
+ *
+ * @param connection the connection handle
+ * @param method the name of the method
+ * @param uri the object type or NULL for all object types
+ * @param config the additional configuration's name and default value
+ * @param type the additional configuration's type (must be one of
+ * \c "boolean"\, \c "int", \c "list" or \c "string")
+ * @param check the additional configuration check string, or NULL if
+ * none
+ * @errors
+ */
+ int __F(configure_method)(WT_CONNECTION *connection,
+ const char *method, const char *uri,
+ const char *config, const char *type, const char *check);
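+
+	/*
+	 * A hedged sketch of adding an application-specific configuration
+	 * option; the method name, option name and default value are
+	 * illustrative assumptions (see @ref custom_ds_config_add for the
+	 * supported names):
+	 *
+	 *	int ret = conn->configure_method(conn,
+	 *	    "session.open_cursor", NULL, "my_option=default",
+	 *	    "string", NULL);
+	 */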
+
+ /*!
+ * Return if opening this handle created the database.
+ *
+ * @snippet ex_all.c Check if the database is newly created
+ *
+ * @param connection the connection handle
+ * @returns false (zero) if the connection existed before the call to
+ * ::wiredtiger_open, true (non-zero) if it was created by opening this
+ * handle.
+ */
+ int __F(is_new)(WT_CONNECTION *connection);
+
+ /*!
+ * @name Session handles
+ * @{
+ */
+ /*!
+ * Open a session.
+ *
+ * @snippet ex_all.c Open a session
+ *
+ * @param connection the connection handle
+ * @param errhandler An error handler. If <code>NULL</code>, the
+ * connection's error handler is used
+ * @configstart{connection.open_session, see dist/api_data.py}
+ * @config{isolation, the default isolation level for operations in this
+ * session., a string\, chosen from the following options: \c
+ * "read-uncommitted"\, \c "read-committed"\, \c "snapshot"; default \c
+ * read-committed.}
+ * @configend
+ * @param[out] sessionp the new session handle
+ * @errors
+ */
+ int __F(open_session)(WT_CONNECTION *connection,
+ WT_EVENT_HANDLER *errhandler, const char *config,
+ WT_SESSION **sessionp);
+ /*! @} */
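+
+	/*
+	 * A minimal sketch of opening a session with snapshot isolation,
+	 * assuming an open connection handle named conn:
+	 *
+	 *	WT_SESSION *session;
+	 *	int ret;
+	 *
+	 *	ret = conn->open_session(
+	 *	    conn, NULL, "isolation=snapshot", &session);
+	 */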
+
+ /*!
+ * @name Extensions
+ * @{
+ */
+ /*!
+ * Load an extension.
+ *
+ * @snippet ex_all.c Load an extension
+ *
+ * @param connection the connection handle
+ * @param path the filename of the extension module, or \c "local" to
+ * search the current application binary for the initialization
+ * function, see @ref extensions for more details.
+ * @configstart{connection.load_extension, see dist/api_data.py}
+ * @config{config, configuration string passed to the entry point of the
+ * extension as its WT_CONFIG_ARG argument., a string; default empty.}
+ * @config{entry, the entry point of the extension\, called to
+ * initialize the extension when it is loaded. The signature of the
+ * function must match ::wiredtiger_extension_init., a string; default
+ * \c wiredtiger_extension_init.}
+ * @config{terminate, an optional function in the extension that is
+ * called before the extension is unloaded during WT_CONNECTION::close.
+ * The signature of the function must match
+ * ::wiredtiger_extension_terminate., a string; default \c
+ * wiredtiger_extension_terminate.}
+ * @configend
+ * @errors
+ */
+ int __F(load_extension)(WT_CONNECTION *connection,
+ const char *path, const char *config);
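+
+	/*
+	 * A minimal sketch of loading an extension; the library path is an
+	 * illustrative assumption:
+	 *
+	 *	int ret = conn->load_extension(
+	 *	    conn, "/usr/local/lib/libwiredtiger_snappy.so", NULL);
+	 */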
+
+ /*!
+ * Add a custom data source. See @ref custom_data_sources for more
+ * information.
+ *
+ * The application must first implement the WT_DATA_SOURCE interface
+ * and then register the implementation with WiredTiger:
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE register
+ *
+ * @param connection the connection handle
+ * @param prefix the URI prefix for this data source, e.g., "file:"
+ * @param data_source the application-supplied implementation of
+ * WT_DATA_SOURCE to manage this data source.
+ * @configempty{connection.add_data_source, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_data_source)(WT_CONNECTION *connection, const char *prefix,
+ WT_DATA_SOURCE *data_source, const char *config);
+
+ /*!
+ * Add a custom collation function.
+ *
+ * The application must first implement the WT_COLLATOR interface and
+ * then register the implementation with WiredTiger:
+ *
+ * @snippet ex_all.c WT_COLLATOR register
+ *
+ * @param connection the connection handle
+ * @param name the name of the collation to be used in calls to
+ * WT_SESSION::create
+ * @param collator the application-supplied collation handler
+ * @configempty{connection.add_collator, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_collator)(WT_CONNECTION *connection,
+ const char *name, WT_COLLATOR *collator, const char *config);
+
+ /*!
+ * Add a compression function.
+ *
+ * The application must first implement the WT_COMPRESSOR interface
+ * and then register the implementation with WiredTiger:
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ *
+ * @param connection the connection handle
+ * @param name the name of the compression function to be used in calls
+ * to WT_SESSION::create
+ * @param compressor the application-supplied compression handler
+ * @configempty{connection.add_compressor, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_compressor)(WT_CONNECTION *connection,
+ const char *name, WT_COMPRESSOR *compressor, const char *config);
+
+ /*!
+ * Add a custom extractor for index keys or column groups.
+ * @notyet{custom extractors}
+ *
+ * The application must first implement the WT_EXTRACTOR interface and
+ * then register the implementation with WiredTiger:
+ *
+ * @snippet ex_all.c WT_EXTRACTOR register
+ *
+ * @param connection the connection handle
+ * @param name the name of the extractor to be used in calls to
+ * WT_SESSION::create
+ * @param extractor the application-supplied extractor
+ * @configempty{connection.add_extractor, see dist/api_data.py}
+ * @errors
+ */
+ int __F(add_extractor)(WT_CONNECTION *connection, const char *name,
+ WT_EXTRACTOR *extractor, const char *config);
+
+ /*!
+ * Return a reference to the WiredTiger extension functions.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API declaration
+ *
+ * @param wt_conn the WT_CONNECTION handle
+ * @returns a reference to a WT_EXTENSION_API structure.
+ */
+ WT_EXTENSION_API *__F(get_extension_api)(WT_CONNECTION *wt_conn);
+ /*! @} */
+};
+
+/*!
+ * Open a connection to a database.
+ *
+ * @snippet ex_all.c Open a connection
+ *
+ * @param home The path to the database home directory. See @ref home
+ * for more information.
+ * @param errhandler An error handler. If <code>NULL</code>, a builtin error
+ * handler is installed that writes error messages to stderr
+ * @configstart{wiredtiger_open, see dist/api_data.py}
+ * @config{async = (, asynchronous operations configuration options., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable asynchronous operation., a
+ * boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;ops_max,
+ * maximum number of expected simultaneous asynchronous operations., an integer
+ * between 10 and 4096; default \c 1024.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads, the number of worker threads to
+ * service asynchronous requests., an integer between 1 and 20; default \c 2.}
+ * @config{ ),,}
+ * @config{buffer_alignment, in-memory alignment (in bytes) for buffers used for
+ * I/O. The default value of -1 indicates a platform-specific alignment value
+ * should be used (4KB on Linux systems\, zero elsewhere)., an integer between
+ * -1 and 1MB; default \c -1.}
+ * @config{cache_size, maximum heap memory to allocate for the cache. A
+ * database should configure either a cache_size or a shared_cache not both., an
+ * integer between 1MB and 10TB; default \c 100MB.}
+ * @config{checkpoint = (, periodically checkpoint the database., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;log_size, wait for this amount of log record
+ * bytes to be written to the log between each checkpoint. A database can
+ * configure both log_size and wait to set an upper bound for checkpoints;
+ * setting this value above 0 configures periodic checkpoints., an integer
+ * between 0 and 2GB; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the
+ * checkpoint name., a string; default \c "WiredTigerCheckpoint".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait between each
+ * checkpoint; setting this value above 0 configures periodic checkpoints., an
+ * integer between 0 and 100000; default \c 0.}
+ * @config{ ),,}
+ * @config{checkpoint_sync, flush files to stable storage when closing or
+ * writing checkpoints., a boolean flag; default \c true.}
+ * @config{config_base, write the base configuration file if creating the
+ * database\, see @ref config_base for more information., a boolean flag;
+ * default \c true.}
+ * @config{create, create the database if it does not exist., a boolean flag;
+ * default \c false.}
+ * @config{direct_io, Use \c O_DIRECT to access files. Options are given as a
+ * list\, such as <code>"direct_io=[data]"</code>. Configuring \c direct_io
+ * requires care\, see @ref tuning_system_buffer_cache_direct_io for important
+ * warnings. Including \c "data" will cause WiredTiger data files to use \c
+ * O_DIRECT\, including \c "log" will cause WiredTiger log files to use \c
+ * O_DIRECT\, and including \c "checkpoint" will cause WiredTiger data files
+ * opened at a checkpoint (i.e: read only) to use \c O_DIRECT., a list\, with
+ * values chosen from the following options: \c "checkpoint"\, \c "data"\, \c
+ * "log"; default empty.}
+ * @config{error_prefix, prefix string for error messages., a string; default
+ * empty.}
+ * @config{eviction = (, eviction configuration options., a set of related
+ * configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_max, maximum number of threads
+ * WiredTiger will start to help evict pages from cache. The number of threads
+ * started will vary depending on the current eviction load., an integer between
+ * 1 and 20; default \c 1.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;threads_min, minimum
+ * number of threads WiredTiger will start to help evict pages from cache. The
+ * number of threads currently running will vary depending on the current
+ * eviction load., an integer between 1 and 20; default \c 1.}
+ * @config{ ),,}
+ * @config{eviction_dirty_target, continue evicting until the cache has less
+ * dirty memory than the value\, as a percentage of the total cache size. Dirty
+ * pages will only be evicted if the cache is full enough to trigger eviction.,
+ * an integer between 10 and 99; default \c 80.}
+ * @config{eviction_target, continue evicting until the cache has less total
+ * memory than the value\, as a percentage of the total cache size. Must be
+ * less than \c eviction_trigger., an integer between 10 and 99; default \c 80.}
+ * @config{eviction_trigger, trigger eviction when the cache is using this much
+ * memory\, as a percentage of the total cache size., an integer between 10 and
+ * 99; default \c 95.}
+ * @config{exclusive, fail if the database already exists\, generally used with
+ * the \c create option., a boolean flag; default \c false.}
+ * @config{extensions, list of shared library extensions to load (using dlopen).
+ * Any values specified to a library extension are passed to
+ * WT_CONNECTION::load_extension as the \c config parameter (for example\,
+ * <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings;
+ * default empty.}
+ * @config{file_extend, file extension configuration. If set\, extend files of
+ * the set type in allocations of the set size\, instead of a block at a time as
+ * each new block is written. For example\,
+ * <code>file_extend=(data=16MB)</code>., a list\, with values chosen from the
+ * following options: \c "data"\, \c "log"; default empty.}
+ * @config{hazard_max, maximum number of simultaneous hazard pointers per
+ * session handle., an integer greater than or equal to 15; default \c 1000.}
+ * @config{log = (, enable logging., a set of related configuration options
+ * defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;archive, automatically
+ * archive unneeded log files., a boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, enable logging subsystem., a boolean
+ * flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;file_max, the
+ * maximum size of log files., an integer between 100KB and 2GB; default \c
+ * 100MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the path to a directory into
+ * which the log files are written. If the value is not an absolute path name\,
+ * the files are created relative to the database home., a string; default \c
+ * "".}
+ * @config{ ),,}
+ * @config{lsm_manager = (, configure database wide options for LSM tree
+ * management., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;merge, merge LSM chunks where possible., a
+ * boolean flag; default \c true.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;worker_thread_max, Configure a set of threads
+ * to manage merging LSM trees in the database., an integer between 3 and 20;
+ * default \c 4.}
+ * @config{ ),,}
+ * @config{mmap, Use memory mapping to access files when possible., a boolean
+ * flag; default \c true.}
+ * @config{multiprocess, permit sharing between processes (will automatically
+ * start an RPC server for primary processes and use RPC for secondary
+ * processes). <b>Not yet supported in WiredTiger</b>., a boolean flag; default
+ * \c false.}
+ * @config{session_max, maximum expected number of sessions (including server
+ * threads)., an integer greater than or equal to 1; default \c 100.}
+ * @config{shared_cache = (, shared cache configuration options. A database
+ * should configure either a cache_size or a shared_cache not both., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk, the granularity that a shared cache is
+ * redistributed., an integer between 1MB and 10TB; default \c 10MB.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, name of a cache that is shared between
+ * databases., a string; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this database is
+ * guaranteed to have available from the shared cache. This setting is per
+ * database. Defaults to the chunk size., an integer; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for the
+ * shared cache. Setting this will update the value if one is already set., an
+ * integer between 1MB and 10TB; default \c 500MB.}
+ * @config{ ),,}
+ * @config{statistics, Maintain database statistics\, which may impact
+ * performance. Choosing "all" maintains all statistics regardless of cost\,
+ * "fast" maintains a subset of statistics that are relatively inexpensive\,
+ * "none" turns off all statistics. The "clear" configuration resets statistics
+ * after they are gathered\, where appropriate (for example\, a cache size
+ * statistic is not cleared\, while the count of cursor insert operations will
+ * be cleared). When "clear" is configured for the database\, gathered
+ * statistics are reset each time a statistics cursor is used to gather
+ * statistics\, as well as each time statistics are logged using the \c
+ * statistics_log configuration. See @ref statistics for more information., a
+ * list\, with values chosen from the following options: \c "all"\, \c "fast"\,
+ * \c "none"\, \c "clear"; default \c none.}
+ * @config{statistics_log = (, log any statistics the database is configured to
+ * maintain\, to a file. See @ref statistics for more information., a set of
+ * related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;on_close, log statistics on database close.,
+ * a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;path, the
+ * pathname to a file into which the log records are written\, may contain ISO C
+ * standard strftime conversion specifications. If the value is not an absolute
+ * path name\, the file is created relative to the database home., a string;
+ * default \c "WiredTigerStat.%d.%H".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;sources,
+ * if non-empty\, include statistics for the list of data source URIs\, if they
+ * are open at the time of the statistics logging. The list may include URIs
+ * matching a single data source ("table:mytable")\, or a URI matching all data
+ * sources of a particular type ("table:")., a list of strings; default empty.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;timestamp, a timestamp prepended to each log
+ * record\, may contain strftime conversion specifications., a string; default
+ * \c "%b %d %H:%M:%S".}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;wait, seconds to wait
+ * between each write of the log records., an integer between 0 and 100000;
+ * default \c 0.}
+ * @config{ ),,}
+ * @config{transaction_sync = (, how to sync log records when the transaction
+ * commits., a set of related configuration options defined below.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;enabled, whether to sync the log on every
+ * commit by default\, can be overridden by the \c sync setting to
+ * WT_SESSION::begin_transaction., a boolean flag; default \c false.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;method, the method used to ensure log records
+ * are stable on disk\, see @ref tune_durability for more information., a
+ * string\, chosen from the following options: \c "dsync"\, \c "fsync"\, \c
+ * "none"; default \c fsync.}
+ * @config{ ),,}
+ * @config{use_environment_priv, use the \c WIREDTIGER_CONFIG and \c
+ * WIREDTIGER_HOME environment variables regardless of whether or not the
+ * process is running with special privileges. See @ref home for more
+ * information., a boolean flag; default \c false.}
+ * @config{verbose, enable messages for various events. Only available if
+ * WiredTiger is configured with --enable-verbose. Options are given as a
+ * list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with
+ * values chosen from the following options: \c "api"\, \c "block"\, \c
+ * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\,
+ * \c "log"\, \c "lsm"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c
+ * "read"\, \c "reconcile"\, \c "recovery"\, \c "salvage"\, \c "shared_cache"\,
+ * \c "split"\, \c "temporary"\, \c "transaction"\, \c "verify"\, \c "version"\,
+ * \c "write"; default empty.}
+ * @configend
+ * Additionally, if files named \c WiredTiger.config or \c WiredTiger.basecfg
+ * appear in the WiredTiger home directory, they are read for configuration
+ * values (see @ref config_file and @ref config_base for details).
+ * See @ref config_order for ordering of the configuration mechanisms.
+ * @param[out] connectionp A pointer to the newly opened connection handle
+ * @errors
+ */
+int wiredtiger_open(const char *home,
+ WT_EVENT_HANDLER *errhandler, const char *config,
+ WT_CONNECTION **connectionp);
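+
+/*
+ * A minimal sketch of opening (and creating, if necessary) a database; the
+ * home directory path is an illustrative assumption and error handling is
+ * abbreviated:
+ *
+ *	WT_CONNECTION *conn;
+ *	int ret;
+ *
+ *	ret = wiredtiger_open("/path/to/database/home", NULL, "create", &conn);
+ *	if (ret != 0)
+ *		fprintf(stderr, "wiredtiger_open: %s\n",
+ *		    wiredtiger_strerror(ret));
+ */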
+
+/*!
+ * Return information about an error as a string; wiredtiger_strerror is a
+ * superset of the ISO C99/POSIX 1003.1-2001 function strerror.
+ *
+ * @snippet ex_all.c Display an error
+ *
+ * @param err a return value from a WiredTiger, C library or POSIX function
+ * @returns a string representation of the error
+ */
+const char *wiredtiger_strerror(int err);
+
+#if !defined(SWIG)
+/*!
+ * The interface implemented by applications to accept notifications
+ * of the completion of asynchronous operations.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::async_new_op.
+ *
+ * @snippet ex_async.c async handle allocation
+ */
+struct __wt_async_callback {
+ /*!
+ * Callback to receive completion notification.
+ *
+ * @param[in] op the operation handle
+ * @param[in] op_ret the result of the async operation
+ * @param[in] flags currently unused
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet ex_async.c async example callback implementation
+ */
+ int (*notify)(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op,
+ int op_ret, uint32_t flags);
+};
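+
+/*
+ * A hedged sketch of implementing the callback; the function and structure
+ * names are illustrative assumptions:
+ *
+ *	static int
+ *	my_notify(WT_ASYNC_CALLBACK *cb,
+ *	    WT_ASYNC_OP *op, int op_ret, uint32_t flags)
+ *	{
+ *		(void)cb; (void)op; (void)flags;
+ *		return (op_ret == 0 || op_ret == WT_NOTFOUND ? 0 : op_ret);
+ *	}
+ *	static WT_ASYNC_CALLBACK my_callback = { my_notify };
+ */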
+#endif
+
+/*!
+ * The interface implemented by applications to handle error, informational and
+ * progress messages. Entries set to NULL are ignored and the default handlers
+ * will continue to be used.
+ */
+struct __wt_event_handler {
+ /*!
+ * Callback to handle error messages; by default, error messages are
+ * written to the stderr stream.
+ *
+ * Error handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session the WiredTiger session handle in use when the error
+ * was generated. The handle may have been created by the application
+ * or automatically by WiredTiger.
+ * @param error a WiredTiger, C99 or POSIX error code, which can
+ * be converted to a string using ::wiredtiger_strerror
+ * @param message an error string
+ */
+ int (*handle_error)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *message);
+
+ /*!
+ * Callback to handle informational messages; by default, informational
+ * messages are written to the stdout stream.
+ *
+ * Message handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session the WiredTiger session handle in use when the message
+ * was generated. The handle may have been created by the application
+ * or automatically by WiredTiger.
+ * @param message an informational string
+ */
+ int (*handle_message)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message);
+
+ /*!
+ * Callback to handle progress messages; by default, no progress
+ * messages are written.
+ *
+ * Progress handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session the WiredTiger session handle in use when the
+ * progress message was generated. The handle may have been created by
+ * the application or automatically by WiredTiger.
+ * @param operation a string representation of the operation
+ * @param progress a counter
+ */
+ int (*handle_progress)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress);
+
+ /*!
+ * Callback to handle automatic close of a WiredTiger handle.
+ *
+ * Close handler returns are not ignored: if the handler returns
+ * non-zero, the error may cause the WiredTiger function posting the
+ * event to fail, and may even cause operation or library failure.
+ *
+ * @param session The session handle that is being closed if the
+ * cursor parameter is NULL.
+ * @param cursor The cursor handle that is being closed, or NULL if
+ * it is a session handle being closed.
+ */
+ int (*handle_close)(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, WT_CURSOR *cursor);
+};
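+
+/*
+ * A hedged sketch of a custom error handler; entries left NULL fall back to
+ * the default handlers, and the names are illustrative assumptions:
+ *
+ *	static int
+ *	my_handle_error(WT_EVENT_HANDLER *handler,
+ *	    WT_SESSION *session, int error, const char *message)
+ *	{
+ *		(void)handler; (void)session;
+ *		fprintf(stderr, "WiredTiger (%d): %s\n", error, message);
+ *		return (0);
+ *	}
+ *	static WT_EVENT_HANDLER my_event_handler = {
+ *		my_handle_error, NULL, NULL, NULL };
+ */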
+
+/*!
+ * @name Data packing and unpacking
+ * @{
+ */
+
+/*!
+ * Pack a structure into a buffer.
+ *
+ * See @ref packing for a description of the permitted format strings.
+ *
+ * @section pack_examples Packing Examples
+ *
+ * For example, the string <code>"iSh"</code> will pack a 32-bit integer
+ * followed by a NUL-terminated string, followed by a 16-bit integer. The
+ * default, big-endian encoding will be used, with no alignment. This could be
+ * used in C as follows:
+ *
+ * @snippet ex_all.c Pack fields into a buffer
+ *
+ * Then later, the values can be unpacked as follows:
+ *
+ * @snippet ex_all.c Unpack fields from a buffer
+ *
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_pack(WT_SESSION *session,
+ void *buffer, size_t size, const char *format, ...);
+
+/*!
+ * Calculate the size required to pack a structure.
+ *
+ * Note that for variable-sized fields including variable-sized strings and
+ * integers, the calculated size merely reflects the expected sizes specified
+ * in the format string itself.
+ *
+ * @snippet ex_all.c Get the packed size
+ *
+ * @param session the session handle
+ * @param sizep a location where the number of bytes needed for the
+ * matching call to ::wiredtiger_struct_pack is returned
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_size(WT_SESSION *session,
+ size_t *sizep, const char *format, ...);
+
+/*!
+ * Unpack a structure from a buffer.
+ *
+ * Reverse of ::wiredtiger_struct_pack: gets values out of a
+ * packed byte string.
+ *
+ * @snippet ex_all.c Unpack fields from a buffer
+ *
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+int wiredtiger_struct_unpack(WT_SESSION *session,
+ const void *buffer, size_t size, const char *format, ...);
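+
+/*
+ * A minimal sketch of the "iSh" example above, assuming an open session
+ * handle and abbreviated error checking:
+ *
+ *	char buf[100];
+ *	size_t size;
+ *	int32_t i;
+ *	const char *s;
+ *	int16_t h;
+ *
+ *	(void)wiredtiger_struct_size(
+ *	    session, &size, "iSh", 42, "hello", 12);
+ *	if (size <= sizeof(buf)) {
+ *		(void)wiredtiger_struct_pack(
+ *		    session, buf, sizeof(buf), "iSh", 42, "hello", 12);
+ *		(void)wiredtiger_struct_unpack(
+ *		    session, buf, sizeof(buf), "iSh", &i, &s, &h);
+ *	}
+ */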
+
+#if !defined(SWIG)
+
+/*!
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ * The stream is an opaque handle returned by ::wiredtiger_pack_start or
+ * ::wiredtiger_unpack_start; it must be closed with ::wiredtiger_pack_close.
+ */
+typedef struct __wt_pack_stream WT_PACK_STREAM;
+
+/*!
+ * Start a packing operation into a buffer with the given format string. This
+ * should be followed by a series of calls to ::wiredtiger_pack_item,
+ * ::wiredtiger_pack_int, ::wiredtiger_pack_str or ::wiredtiger_pack_uint
+ * to fill in the values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory to hold the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_pack_start(WT_SESSION *session,
+ const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Start an unpacking operation from a buffer with the given format string.
+ * This should be followed by a series of calls to ::wiredtiger_unpack_item,
+ * ::wiredtiger_unpack_int, ::wiredtiger_unpack_str or ::wiredtiger_unpack_uint
+ * to retrieve the packed values.
+ *
+ * @param session the session handle
+ * @param format the data format, see @ref packing
+ * @param buffer a pointer to memory holding the packed data
+ * @param size the size of the buffer
+ * @param[out] psp the new packing stream handle
+ * @errors
+ */
+int wiredtiger_unpack_start(WT_SESSION *session,
+ const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp);
+
+/*!
+ * Close a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] usedp the number of bytes in the buffer used by the stream
+ * @errors
+ */
+int wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp);
+
+/*!
+ * Pack an item into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param item an item to pack
+ * @errors
+ */
+int wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Pack a signed integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param i a signed integer to pack
+ * @errors
+ */
+int wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i);
+
+/*!
+ * Pack a string into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param s a string to pack
+ * @errors
+ */
+int wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s);
+
+/*!
+ * Pack an unsigned integer into a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param u an unsigned integer to pack
+ * @errors
+ */
+int wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u);
+
+/*!
+ * Unpack an item from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param item an item to unpack
+ * @errors
+ */
+int wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item);
+
+/*!
+ * Unpack a signed integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] ip the unpacked signed integer
+ * @errors
+ */
+int wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip);
+
+/*!
+ * Unpack a string from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] sp the unpacked string
+ * @errors
+ */
+int wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp);
+
+/*!
+ * Unpack an unsigned integer from a packing stream.
+ *
+ * @param ps the packing stream handle
+ * @param[out] up the unpacked unsigned integer
+ * @errors
+ */
+int wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up);
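+
+/*
+ * A minimal streaming sketch: pack an unsigned 64-bit integer and a string,
+ * then unpack them again. The format string "QS" is an assumption based on
+ * @ref packing, and error handling is elided:
+ *
+ *	WT_PACK_STREAM *ps;
+ *	char buf[100];
+ *	size_t used;
+ *	uint64_t u;
+ *	const char *s;
+ *
+ *	(void)wiredtiger_pack_start(session, "QS", buf, sizeof(buf), &ps);
+ *	(void)wiredtiger_pack_uint(ps, 42);
+ *	(void)wiredtiger_pack_str(ps, "hello");
+ *	(void)wiredtiger_pack_close(ps, &used);
+ *
+ *	(void)wiredtiger_unpack_start(session, "QS", buf, used, &ps);
+ *	(void)wiredtiger_unpack_uint(ps, &u);
+ *	(void)wiredtiger_unpack_str(ps, &s);
+ *	(void)wiredtiger_pack_close(ps, &used);
+ */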
+/*! @} */
+
+/*!
+ * @name Configuration string parsing
+ * @{
+ */
+
+/*!
+ * The configuration information returned by the WiredTiger configuration
+ * parsing functions in the WT_EXTENSION_API and the public API.
+ */
+struct __wt_config_item {
+ /*!
+ * The value of a configuration string.
+ *
+ * Regardless of the type of the configuration string (boolean, int,
+ * list or string), the \c str field will reference the value of the
+ * configuration string.
+ *
+	 * The bytes referenced by \c str are <b>not</b> nul-terminated;
+	 * use the \c len field instead of a terminating nul byte.
+ */
+ const char *str;
+
+ /*! The number of bytes in the value referenced by \c str. */
+ size_t len;
+
+ /*!
+ * The value of a configuration boolean or integer.
+ *
+ * If the configuration string's value is "true" or "false", the
+	 * \c val field will be set to 1 (true) or 0 (false).
+ *
+ * If the configuration string can be legally interpreted as an integer,
+ * using the strtoll function rules as specified in ISO/IEC 9899:1990
+ * ("ISO C90"), that integer will be stored in the \c val field.
+ */
+ int64_t val;
+
+ /*! Permitted values of the \c type field. */
+ enum {
+ /*! A string value with quotes stripped. */
+ WT_CONFIG_ITEM_STRING,
+ /*! A boolean literal ("true" or "false"). */
+ WT_CONFIG_ITEM_BOOL,
+ /*! An unquoted identifier: a string value without quotes. */
+ WT_CONFIG_ITEM_ID,
+ /*! A numeric value. */
+ WT_CONFIG_ITEM_NUM,
+ /*! A nested structure or list, including brackets. */
+ WT_CONFIG_ITEM_STRUCT
+ }
+ /*!
+ * The type of value determined by the parser. In all cases,
+ * the \c str and \c len fields are set.
+ */
+ type;
+};
+
+/*!
+ * Create a handle that can be used to parse or create configuration strings
+ * compatible with WiredTiger APIs.
+ * This API is outside the scope of a WiredTiger connection handle, since
+ * applications may need to generate configuration strings prior to calling
+ * ::wiredtiger_open.
+ * @param session the session handle to be used for error reporting. If NULL,
+ * error messages will be written to stdout.
+ * @param config the configuration string being parsed. The string must
+ * remain valid for the lifetime of the parser handle.
+ * @param len the number of valid bytes in \c config
+ * @param[out] config_parserp A pointer to the newly opened handle
+ * @errors
+ */
+int wiredtiger_config_parser_open(WT_SESSION *session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+
+/*!
+ * A handle that can be used to search and traverse configuration strings
+ * compatible with WiredTiger APIs.
+ * To parse the contents of a list or nested configuration string use a new
+ * configuration parser handle based on the content of the ::WT_CONFIG_ITEM
+ * retrieved from the parent configuration string.
+ *
+ * @section config_parse_examples Configuration String Parsing examples
+ *
+ * This could be used in C to create a configuration parser as follows:
+ *
+ * @snippet ex_config_parse.c Create a configuration parser
+ *
+ * Once the parser has been created the content can be queried directly:
+ *
+ * @snippet ex_config_parse.c get
+ *
+ * Or the content can be traversed linearly:
+ *
+ * @snippet ex_config_parse.c next
+ *
+ * Nested configuration values can be queried using a shorthand notation:
+ *
+ * @snippet ex_config_parse.c nested get
+ *
+ * Nested configuration values can be traversed using multiple
+ * ::WT_CONFIG_PARSER handles:
+ *
+ * @snippet ex_config_parse.c nested traverse
+ */
+struct __wt_config_parser {
+
+ /*!
+	 * Close the configuration scanner, releasing any resources.
+ *
+ * @param config_parser the configuration parser handle
+ * @errors
+ *
+ */
+ int __F(close)(WT_CONFIG_PARSER *config_parser);
+
+ /*!
+ * Return the next key/value pair.
+ *
+	 * When iteration would pass the end of the configuration string,
+	 * ::WT_NOTFOUND will be returned.
+ *
+ * If an item has no explicitly assigned value, the item will be
+ * returned in \c key and the \c value will be set to the boolean
+ * \c "true" value.
+ *
+ * @param config_parser the configuration parser handle
+ * @param key the returned key
+ * @param value the returned value
+ * @errors
+ *
+ */
+ int __F(next)(WT_CONFIG_PARSER *config_parser,
+ WT_CONFIG_ITEM *key, WT_CONFIG_ITEM *value);
+
+ /*!
+ * Return the value of an item in the configuration string.
+ *
+ * @param config_parser the configuration parser handle
+ * @param key configuration key string
+ * @param value the returned value
+ * @errors
+ *
+ */
+ int __F(get)(WT_CONFIG_PARSER *config_parser,
+ const char *key, WT_CONFIG_ITEM *value);
+};
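+
+/*
+ * A minimal sketch of parsing a configuration string; the string contents
+ * and key are illustrative, and error checking is abbreviated:
+ *
+ *	WT_CONFIG_PARSER *parser;
+ *	WT_CONFIG_ITEM v;
+ *	const char *config = "path=/dev/loop,page_size=1024";
+ *
+ *	(void)wiredtiger_config_parser_open(
+ *	    NULL, config, strlen(config), &parser);
+ *	if (parser->get(parser, "page_size", &v) == 0)
+ *		printf("page_size: %" PRId64 "\n", v.val);
+ *	(void)parser->close(parser);
+ */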
+
+#endif /* !defined(SWIG) */
+/*! @} */
+
+/*!
+ * Get version information.
+ *
+ * @snippet ex_all.c Get the WiredTiger library version #1
+ * @snippet ex_all.c Get the WiredTiger library version #2
+ *
+ * @param majorp a location where the major version number is returned
+ * @param minorp a location where the minor version number is returned
+ * @param patchp a location where the patch version number is returned
+ * @returns a string representation of the version
+ */
+const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
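+
+/*
+ * A minimal usage sketch:
+ *
+ *	int major, minor, patch;
+ *	printf("WiredTiger %s\n",
+ *	    wiredtiger_version(&major, &minor, &patch));
+ */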
+
+/*******************************************
+ * Error returns
+ *******************************************/
+/*!
+ * @anchor error_returns
+ * @name Error returns
+ * Most functions and methods in WiredTiger return an integer code indicating
+ * whether the operation succeeded or failed. A return of zero indicates
+ * success, all non-zero return values indicate some kind of failure.
+ *
+ * WiredTiger reserves all values from -31,800 to -31,999 as possible error
+ * return values. WiredTiger may also return C99/POSIX error codes such as
+ * \c ENOMEM, \c EINVAL and \c ENOTSUP, with the usual meanings.
+ *
+ * The following are all of the WiredTiger-specific error returns:
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/api_err.py.
+ * Error return section: BEGIN
+ */
+/*!
+ * Attempt to insert an existing key.
+ * This error is generated when the application attempts to insert a record with
+ * the same key as an existing record without the 'overwrite' configuration to
+ * WT_SESSION::open_cursor.
+ */
+#define WT_DUPLICATE_KEY -31800
+/*!
+ * Non-specific WiredTiger error.
+ * This error is returned when an error is not covered by a specific error
+ * return.
+ */
+#define WT_ERROR -31801
+/*!
+ * Item not found.
+ * This error indicates an operation did not find a value to return. This
+ * includes cursor search and other operations where no record matched the
+ * cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove.
+ */
+#define WT_NOTFOUND -31802
+/*!
+ * WiredTiger library panic.
+ * This error indicates an underlying problem that requires the application
+ * to exit and restart.
+ */
+#define WT_PANIC -31803
+/*! @cond internal */
+/*! Restart the operation (internal). */
+#define WT_RESTART -31804
+/*! @endcond */
+/*!
+ * Conflict between concurrent operations.
+ * This error is generated when an operation cannot be completed due to a
+ * conflict with concurrent operations. The operation may be retried; if a
+ * transaction is in progress, it should be rolled back and the operation
+ * retried in a new transaction.
+ */
+#define WT_ROLLBACK -31805
+/*
+ * Error return section: END
+ * DO NOT EDIT: automatically built by dist/api_err.py.
+ */
+/*! @} */
+
+#ifndef DOXYGEN
+#define WT_DEADLOCK WT_ROLLBACK /* Backward compatibility */
+#endif
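+
+/*
+ * A hedged sketch of the retry pattern described for ::WT_ROLLBACK; the
+ * application function do_op() is a hypothetical stand-in for one
+ * transactional operation performed in an already-started transaction:
+ *
+ *	while ((ret = do_op(session)) == WT_ROLLBACK) {
+ *		(void)session->rollback_transaction(session, NULL);
+ *		(void)session->begin_transaction(session, NULL);
+ *	}
+ */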
+
+/*! @} */
+
+/*!
+ * @defgroup wt_ext WiredTiger Extension API
+ * The functions and interfaces applications use to customize and extend the
+ * behavior of WiredTiger.
+ * @{
+ */
+
+/*******************************************
+ * Forward structure declarations for the extension API
+ *******************************************/
+struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG;
+
+/*!
+ * The interface implemented by applications to provide custom ordering of
+ * records.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_collator.
+ *
+ * @snippet ex_extending.c add collator nocase
+ *
+ * @snippet ex_extending.c add collator prefix10
+ */
+struct __wt_collator {
+ /*!
+ * Callback to compare keys.
+ *
+ * @param[out] cmp set to -1 if <code>key1 < key2</code>,
+ * 0 if <code>key1 == key2</code>,
+ * 1 if <code>key1 > key2</code>.
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet ex_all.c Implement WT_COLLATOR
+ *
+ * @snippet ex_extending.c case insensitive comparator
+ *
+ * @snippet ex_extending.c n character comparator
+ */
+ int (*compare)(WT_COLLATOR *collator, WT_SESSION *session,
+ const WT_ITEM *key1, const WT_ITEM *key2, int *cmp);
+
+ /*!
+ * If non-NULL, this callback is called to customize the collator
+ * for each data source. If the callback returns a non-NULL
+ * collator, that instance is used instead of this one for all
+ * comparisons.
+ */
+ int (*customize)(WT_COLLATOR *collator, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ITEM *appcfg, WT_COLLATOR **customp);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+ * The WT_COLLATOR::terminate callback is intended to allow cleanup,
+ * the handle will not be subsequently accessed by WiredTiger.
+ */
+ int (*terminate)(WT_COLLATOR *collator, WT_SESSION *session);
+};
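+
+/*
+ * A hedged sketch of a byte-wise collator; the names are illustrative
+ * assumptions and the unused callbacks are left NULL:
+ *
+ *	static int
+ *	my_compare(WT_COLLATOR *collator, WT_SESSION *session,
+ *	    const WT_ITEM *k1, const WT_ITEM *k2, int *cmp)
+ *	{
+ *		size_t len;
+ *		int v;
+ *
+ *		(void)collator; (void)session;
+ *		len = k1->size < k2->size ? k1->size : k2->size;
+ *		v = memcmp(k1->data, k2->data, len);
+ *		if (v == 0 && k1->size != k2->size)
+ *			v = k1->size < k2->size ? -1 : 1;
+ *		*cmp = v < 0 ? -1 : (v > 0 ? 1 : 0);
+ *		return (0);
+ *	}
+ *	static WT_COLLATOR my_collator = { my_compare, NULL, NULL };
+ */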
+
+/*!
+ * The interface implemented by applications to provide custom compression.
+ *
+ * Compressors must implement the WT_COMPRESSOR interface: the
+ * WT_COMPRESSOR::compress and WT_COMPRESSOR::decompress callbacks must be
+ * specified, and WT_COMPRESSOR::pre_size is optional. To build your own
+ * compressor, use one of the compressors in \c ext/compressors as a template:
+ * \c ext/nop_compress is a simple compressor that passes through data
+ * unchanged, and is a reasonable starting point.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_compressor.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ */
+struct __wt_compressor {
+ /*!
+ * Callback to compress a chunk of data.
+ *
+ * WT_COMPRESSOR::compress takes a source buffer and a destination
+ * buffer, by default of the same size. If the callback can compress
+ * the buffer to a smaller size in the destination, it does so, sets
+ * the \c compression_failed return to 0 and returns 0. If compression
+ * does not produce a smaller result, the callback sets the
+ * \c compression_failed return to 1 and returns 0. If another
+ * error occurs, it returns an errno or WiredTiger error code.
+ *
+ * On entry, \c src will point to memory, with the length of the memory
+ * in \c src_len. After successful completion, the callback should
+ * return \c 0 and set \c result_lenp to the number of bytes required
+ * for the compressed representation.
+ *
+ * On entry, \c dst points to the destination buffer with a length
+ * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
+ * the destination buffer will be at least the size returned by that
+ * method; otherwise, the destination buffer will be at least as large
+ * as \c src_len.
+ *
+ * If compression would not shrink the data or the \c dst buffer is not
+ * large enough to hold the compressed data, the callback should set
+ * \c compression_failed to a non-zero value and return 0.
+ *
+ * @param[in] src the data to compress
+ * @param[in] src_len the length of the data to compress
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[out] result_lenp the length of the compressed data
+ * @param[out] compression_failed non-zero if compression did not
+ * decrease the length of the data (compression may not have completed)
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR compress
+ */
+ int (*compress)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp, int *compression_failed);
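+
+	/*
+	 * A hedged sketch of a pass-through implementation in the style of
+	 * \c ext/nop_compress: report that compression did not shrink the
+	 * data, so WiredTiger stores the block uncompressed:
+	 *
+	 *	static int
+	 *	nop_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+	 *	    uint8_t *src, size_t src_len,
+	 *	    uint8_t *dst, size_t dst_len,
+	 *	    size_t *result_lenp, int *compression_failed)
+	 *	{
+	 *		(void)compressor; (void)session; (void)src;
+	 *		(void)src_len; (void)dst; (void)dst_len;
+	 *		(void)result_lenp;
+	 *		*compression_failed = 1;
+	 *		return (0);
+	 *	}
+	 */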
+
+ /*!
+ * Callback to compress a list of byte strings.
+ *
+ * WT_COMPRESSOR::compress_raw gives applications fine-grained control
+ * over disk block size when writing row-store or variable-length
+ * column-store pages. Where this level of control is not required by
+ * the underlying storage device, set the WT_COMPRESSOR::compress_raw
+ * callback to \c NULL and WiredTiger will internally split each page
+ * into blocks, each block then compressed by WT_COMPRESSOR::compress.
+ *
+ * WT_COMPRESSOR::compress_raw takes a source buffer and an array of
+ * 0-based offsets of byte strings in that buffer. The callback then
+ * encodes none, some or all of the byte strings and copies the encoded
+ * representation into a destination buffer. The callback returns the
+ * number of byte strings encoded and the bytes needed for the encoded
+ * representation. The encoded representation has header information
+ * prepended and is written as a block to the underlying file object.
+ *
+ * On entry, \c page_max is the configured maximum size for objects of
+ * this type. (This value is provided for convenience, and will be
+ * either the \c internal_page_max or \c leaf_page_max value specified
+ * to WT_SESSION::create when the object was created.)
+ *
+ * On entry, \c split_pct is the configured Btree page split size for
+ * this object. (This value is provided for convenience, and will be
+ * the \c split_pct value specified to WT_SESSION::create when the
+ * object was created.)
+ *
+ * On entry, \c extra is a count of additional bytes that will be added
+ * to the encoded representation before it is written. In other words,
+ * if the target write size is 8KB, the returned encoded representation
+ * should be less than or equal to (8KB - \c extra). The method does
+ * not need to skip bytes in the destination buffer based on \c extra,
+ * the method should only use \c extra to decide how many bytes to store
+ * into the destination buffer for its ideal block size.
+ *
+ * On entry, \c src points to the source buffer; \c offsets is an array
+ * of \c slots 0-based offsets into \c src, where each offset is the
+ * start of a byte string, except for the last offset, which is the
+ * offset of the first byte past the end of the last byte string. (In
+ * other words, <code>offsets[0]</code> will be 0, the offset of the
+ * first byte of the first byte string in \c src, and
+ * <code>offsets[slots]</code> is the total length of all of the byte
+ * strings in the \c src buffer.)
+ *
+ * On entry, \c dst points to the destination buffer with a length
+ * of \c dst_len. If the WT_COMPRESSOR::pre_size method is specified,
+ * the destination buffer will be at least the size returned by that
+ * method; otherwise, the destination buffer will be at least the
+ * maximum size for the page being written (that is, when writing a
+ * row-store leaf page, the destination buffer will be at least as
+ * large as the \c leaf_page_max configuration value).
+ *
+ * After successful completion, the callback should return \c 0, and
+ * set \c result_slotsp to the number of byte strings encoded and
+ * \c result_lenp to the bytes needed for the encoded representation.
+ *
+ * There is no requirement the callback encode any or all of the byte
+ * strings passed by WiredTiger. If the callback does not encode any
+ * of the byte strings and compression should not be retried, the
+ * callback should set \c result_slotsp to 0.
+ *
+ * If the callback does not encode any of the byte strings and
+ * compression should be retried with additional byte strings, the
+ * callback must return \c EAGAIN. In that case, WiredTiger will
+ * accumulate more rows and repeat the call.
+ *
+ * If there are no more rows to accumulate or the callback indicates
+ * that it cannot be retried, WiredTiger writes the remaining rows
+ * using \c WT_COMPRESSOR::compress.
+ *
+ * On entry, \c final is zero if there are more rows to be written as
+ * part of this page (if there will be additional data provided to the
+ * callback), and non-zero if there are no more rows to be written as
+ * part of this page. If \c final is set and the callback fails to
+ * encode any rows, WiredTiger writes the remaining rows without further
+ * calls to the callback. If \c final is set and the callback encodes
+ * any number of rows, WiredTiger continues to call the callback until
+ * all of the rows are encoded or the callback fails to encode any rows.
+ *
+ * The WT_COMPRESSOR::compress_raw callback is intended for applications
+ * wanting to create disk blocks in specific sizes.
+ * WT_COMPRESSOR::compress_raw is not a replacement for
+ * WT_COMPRESSOR::compress: objects which WT_COMPRESSOR::compress_raw
+ * cannot handle (for example, overflow key or value items), or which
+ * WT_COMPRESSOR::compress_raw chooses not to compress for any reason
+ * (for example, if WT_COMPRESSOR::compress_raw callback chooses not to
+ * compress a small number of rows, but the page being written has no
+ * more rows to accumulate), will be passed to WT_COMPRESSOR::compress.
+ *
+ * The WT_COMPRESSOR::compress_raw callback is only called for objects
+ * where it is applicable, that is, for row-store and variable-length
+ * column-store objects, where both row-store key prefix compression
+ * and row-store and variable-length column-store dictionary compression
+ * are \b not configured. When WT_COMPRESSOR::compress_raw is not
+ * applicable, the WT_COMPRESSOR::compress callback is used instead.
+ *
+ * @param[in] page_max the configured maximum page size for this object
+ * @param[in] split_pct the configured page split size for this object
+ * @param[in] extra the count of the additional bytes
+ * @param[in] src the data to compress
+ * @param[in] offsets the byte offsets of the byte strings in src
+ * @param[in] slots the number of entries in offsets
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[in] final non-zero if there are no more rows to accumulate
+ * @param[out] result_lenp the length of the compressed data
+ * @param[out] result_slotsp the number of byte offsets taken
+ * @returns zero for success, non-zero to indicate an error.
+ */
+ int (*compress_raw)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ size_t page_max, int split_pct, size_t extra,
+ uint8_t *src, uint32_t *offsets, uint32_t slots,
+ uint8_t *dst, size_t dst_len,
+ int final,
+ size_t *result_lenp, uint32_t *result_slotsp);
+
+ /*!
+ * Callback to decompress a chunk of data.
+ *
+ * WT_COMPRESSOR::decompress takes a source buffer and a destination
+	 * buffer. The buffer roles are reversed from \c compress: the source
+	 * buffer holds the compressed value, and the destination buffer is
+	 * sized to hold the original data. If the callback successfully
+ * decompresses the source buffer to the destination buffer, it returns
+ * 0. If an error occurs, it returns an errno or WiredTiger error code.
+ * The source buffer that WT_COMPRESSOR::decompress takes may have a
+ * size that is rounded up from the size originally produced by
+ * WT_COMPRESSOR::compress, with the remainder of the buffer set to
+ * zeroes. Most compressors do not care about this difference if the
+ * size to be decompressed can be implicitly discovered from the
+ * compressed data. If your compressor cares, you may need to allocate
+ * space for, and store, the actual size in the compressed buffer. See
+ * the source code for the included snappy compressor for an example.
+ *
+ * On entry, \c src will point to memory, with the length of the memory
+ * in \c src_len. After successful completion, the callback should
+ * return \c 0 and set \c result_lenp to the number of bytes required
+ * for the decompressed representation.
+ *
+ * If the \c dst buffer is not big enough to hold the decompressed
+ * data, the callback should return an error.
+ *
+ * @param[in] src the data to decompress
+ * @param[in] src_len the length of the data to decompress
+ * @param[in] dst the destination buffer
+ * @param[in] dst_len the length of the destination buffer
+ * @param[out] result_lenp the length of the decompressed data
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR decompress
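+	 *
+	 * A minimal sketch (the function name is hypothetical), mirroring a
+	 * no-op compressor that stored the page uncompressed:
+	 *
+	 * @code
+	 *	static int
+	 *	my_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+	 *	    uint8_t *src, size_t src_len,
+	 *	    uint8_t *dst, size_t dst_len,
+	 *	    size_t *result_lenp)
+	 *	{
+	 *		(void)compressor; (void)session;
+	 *
+	 *		// The source may be rounded up past the stored size;
+	 *		// the destination is sized to the original data.
+	 *		if (dst_len > src_len)
+	 *			return (EINVAL);
+	 *		memcpy(dst, src, dst_len);
+	 *		*result_lenp = dst_len;
+	 *		return (0);
+	 *	}
+	 * @endcode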
+ */
+ int (*decompress)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp);
+
+ /*!
+ * Callback to size a destination buffer for compression
+ *
+	 * WT_COMPRESSOR::pre_size is an optional callback that, given the
+	 * source buffer and size, produces the size of the destination buffer
+	 * to be given to WT_COMPRESSOR::compress.  It is intended for
+	 * compressors that require an output buffer larger than the source
+	 * buffer, for example, compressors that assume the output buffer is
+	 * sized for the worst case and make no overrun checks during
+	 * compression.  If your compressor works like this,
+	 * WT_COMPRESSOR::pre_size will need to be defined; see the source
+	 * code for the snappy compressor for an example.  However, if your
+	 * compressor detects and avoids overruns against its target buffer,
+	 * you will not need to define WT_COMPRESSOR::pre_size.
+	 *
+	 * If not NULL, this callback is called before each call to
+	 * WT_COMPRESSOR::compress to determine the size of the destination
+	 * buffer to provide, and should set \c result_lenp to a suitable
+	 * buffer size, typically the maximum length required by
+	 * WT_COMPRESSOR::compress.  If the callback is NULL, the destination
+	 * buffer will be the same size as the source buffer.  This is always
+	 * sufficient, since a compression result that is larger than the
+	 * source buffer is discarded by WiredTiger.
+ *
+ * @param[in] src the data to compress
+ * @param[in] src_len the length of the data to compress
+ * @param[out] result_lenp the required destination buffer size
+ * @returns zero for success, non-zero to indicate an error.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR presize
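+	 *
+	 * A sketch for a hypothetical format whose worst case is the input
+	 * plus a small fixed header (the 16-byte constant is illustrative):
+	 *
+	 * @code
+	 *	static int
+	 *	my_pre_size(WT_COMPRESSOR *compressor, WT_SESSION *session,
+	 *	    uint8_t *src, size_t src_len, size_t *result_lenp)
+	 *	{
+	 *		(void)compressor; (void)session; (void)src;
+	 *
+	 *		// Worst case: the input does not shrink and a
+	 *		// 16-byte header is prepended.
+	 *		*result_lenp = src_len + 16;
+	 *		return (0);
+	 *	}
+	 * @endcode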
+ */
+ int (*pre_size)(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len, size_t *result_lenp);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+	 * The WT_COMPRESSOR::terminate callback is intended to allow cleanup;
+	 * the handle will not be subsequently accessed by WiredTiger.
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR terminate
+ */
+ int (*terminate)(WT_COMPRESSOR *compressor, WT_SESSION *session);
+};
+
+/*!
+ * Applications can extend WiredTiger by providing new implementations of the
+ * WT_DATA_SOURCE class.  Each data source supports a different URI scheme
+ * that applications pass to WT_SESSION::create, WT_SESSION::open_cursor and
+ * related methods.  See @ref custom_data_sources for more information.
+ *
+ * <b>Thread safety:</b> WiredTiger may invoke methods on the WT_DATA_SOURCE
+ * interface from multiple threads concurrently. It is the responsibility of
+ * the implementation to protect any shared data.
+ *
+ * Applications register their implementation with WiredTiger by calling
+ * WT_CONNECTION::add_data_source.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE register
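+ *
+ * For example (a sketch; the \c my_* functions are hypothetical
+ * implementations of the callbacks below, listed in declaration order):
+ *
+ * @code
+ *	static WT_DATA_SOURCE my_dsrc = {
+ *	    my_create, my_compact, my_drop, my_open_cursor, my_rename,
+ *	    my_salvage, my_truncate, my_range_truncate, my_verify,
+ *	    my_checkpoint, my_terminate
+ *	};
+ *
+ *	ret = conn->add_data_source(conn, "my:", &my_dsrc, NULL);
+ * @endcode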
+ */
+struct __wt_data_source {
+ /*!
+ * Callback to create a new object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE create
+ */
+ int (*create)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to compact an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE compact
+ */
+ int (*compact)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to drop an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE drop
+ */
+ int (*drop)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to initialize a cursor.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE open_cursor
+ */
+ int (*open_cursor)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursor);
+
+ /*!
+ * Callback to rename an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE rename
+ */
+ int (*rename)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, const char *newuri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to salvage an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE salvage
+ */
+ int (*salvage)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to truncate an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE truncate
+ */
+ int (*truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to truncate a range of an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE range truncate
+ */
+ int (*range_truncate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ WT_CURSOR *start, WT_CURSOR *stop);
+
+ /*!
+ * Callback to verify an object.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE verify
+ */
+ int (*verify)(WT_DATA_SOURCE *dsrc, WT_SESSION *session,
+ const char *uri, WT_CONFIG_ARG *config);
+
+ /*!
+ * Callback to checkpoint the database.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE checkpoint
+ */
+ int (*checkpoint)(
+ WT_DATA_SOURCE *dsrc, WT_SESSION *session, WT_CONFIG_ARG *config);
+
+ /*!
+ * If non-NULL, a callback performed when the database is closed.
+ *
+	 * The WT_DATA_SOURCE::terminate callback is intended to allow cleanup;
+	 * the handle will not be subsequently accessed by WiredTiger.
+ *
+ * @snippet ex_data_source.c WT_DATA_SOURCE terminate
+ */
+ int (*terminate)(WT_DATA_SOURCE *dsrc, WT_SESSION *session);
+};
+
+/*!
+ * The interface implemented by applications to provide custom extraction of
+ * index keys or column group values.
+ *
+ * Applications register implementations with WiredTiger by calling
+ * WT_CONNECTION::add_extractor.
+ *
+ * @snippet ex_all.c WT_EXTRACTOR register
+ */
+struct __wt_extractor {
+ /*!
+ * Callback to extract a value for an index or column group.
+ *
+ * @errors
+ *
+ * @snippet ex_all.c WT_EXTRACTOR
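+	 *
+	 * A minimal sketch that indexes the first 4 bytes of each value
+	 * (illustration only; the function name is hypothetical):
+	 *
+	 * @code
+	 *	static int
+	 *	my_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
+	 *	    const WT_ITEM *key, const WT_ITEM *value, WT_ITEM *result)
+	 *	{
+	 *		(void)extractor; (void)session; (void)key;
+	 *
+	 *		if (value->size < 4)
+	 *			return (EINVAL);
+	 *		result->data = value->data;
+	 *		result->size = 4;
+	 *		return (0);
+	 *	}
+	 * @endcode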
+ */
+ int (*extract)(WT_EXTRACTOR *extractor, WT_SESSION *session,
+ const WT_ITEM *key, const WT_ITEM *value, WT_ITEM *result);
+};
+
+/*!
+ * Entry point to an extension, called when the extension is loaded.
+ *
+ * @param connection the connection handle
+ * @param config the config information passed to WT_CONNECTION::load_extension
+ * @errors
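+ *
+ * For example, a sketch of an entry point that registers a compressor,
+ * assuming a \c my_compressor structure populated with the WT_COMPRESSOR
+ * callbacks:
+ *
+ * @code
+ *	int
+ *	wiredtiger_extension_init(
+ *	    WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+ *	{
+ *		(void)config;
+ *
+ *		return (connection->add_compressor(
+ *		    connection, "my_compress", &my_compressor, NULL));
+ *	}
+ * @endcode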
+ */
+extern int wiredtiger_extension_init(
+ WT_CONNECTION *connection, WT_CONFIG_ARG *config);
+
+/*!
+ * Optional cleanup function for an extension, called during
+ * WT_CONNECTION::close.
+ *
+ * @param connection the connection handle
+ * @errors
+ */
+extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
+
+/*! @} */
+
+/*******************************************
+ * Statistic reference.
+ *******************************************/
+/*!
+ * @addtogroup wt
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/api_stat.py.
+ * Statistics section: BEGIN
+ */
+
+/*!
+ * @name Connection statistics
+ * @anchor statistics_keys
+ * @anchor statistics_conn
+ * Statistics are accessed through cursors with \c "statistics:" URIs.
+ * Individual statistics can be queried through the cursor using the following
+ * keys. See @ref data_statistics for more information.
+ * @{
+ */
+/*! async: number of allocation state races */
+#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1000
+/*! async: number of op slots viewed for alloc */
+#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1001
+/*! async: current work queue length */
+#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1002
+/*! async: number of async flush calls */
+#define WT_STAT_CONN_ASYNC_FLUSH 1003
+/*! async: number of times op allocation failed */
+#define WT_STAT_CONN_ASYNC_FULL 1004
+/*! async: maximum work queue length */
+#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1005
+/*! async: number of times worker found no work */
+#define WT_STAT_CONN_ASYNC_NOWORK 1006
+/*! async: op allocations */
+#define WT_STAT_CONN_ASYNC_OP_ALLOC 1007
+/*! async: op compact calls */
+#define WT_STAT_CONN_ASYNC_OP_COMPACT 1008
+/*! async: op insert calls */
+#define WT_STAT_CONN_ASYNC_OP_INSERT 1009
+/*! async: op remove calls */
+#define WT_STAT_CONN_ASYNC_OP_REMOVE 1010
+/*! async: op search calls */
+#define WT_STAT_CONN_ASYNC_OP_SEARCH 1011
+/*! async: op update calls */
+#define WT_STAT_CONN_ASYNC_OP_UPDATE 1012
+/*! block manager: mapped bytes read */
+#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1013
+/*! block manager: bytes read */
+#define WT_STAT_CONN_BLOCK_BYTE_READ 1014
+/*! block manager: bytes written */
+#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1015
+/*! block manager: mapped blocks read */
+#define WT_STAT_CONN_BLOCK_MAP_READ 1016
+/*! block manager: blocks pre-loaded */
+#define WT_STAT_CONN_BLOCK_PRELOAD 1017
+/*! block manager: blocks read */
+#define WT_STAT_CONN_BLOCK_READ 1018
+/*! block manager: blocks written */
+#define WT_STAT_CONN_BLOCK_WRITE 1019
+/*! cache: tracked dirty bytes in the cache */
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1020
+/*! cache: bytes currently in the cache */
+#define WT_STAT_CONN_CACHE_BYTES_INUSE 1021
+/*! cache: maximum bytes configured */
+#define WT_STAT_CONN_CACHE_BYTES_MAX 1022
+/*! cache: bytes read into cache */
+#define WT_STAT_CONN_CACHE_BYTES_READ 1023
+/*! cache: bytes written from cache */
+#define WT_STAT_CONN_CACHE_BYTES_WRITE 1024
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1025
+/*! cache: unmodified pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1026
+/*! cache: page split during eviction deepened the tree */
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1027
+/*! cache: modified pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1028
+/*! cache: pages selected for eviction unable to be evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1029
+/*! cache: pages evicted because they exceeded the in-memory maximum */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1030
+/*! cache: failed eviction of pages that exceeded the in-memory maximum */
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1031
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1032
+/*! cache: internal pages evicted */
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1033
+/*! cache: eviction server candidate queue empty when topping up */
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1034
+/*! cache: eviction server candidate queue not empty when topping up */
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1035
+/*! cache: eviction server evicting pages */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1036
+/*! cache: eviction server populating queue, but not evicting pages */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1037
+/*! cache: eviction server unable to reach eviction goal */
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1038
+/*! cache: pages split during eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1039
+/*! cache: pages walked for eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1040
+/*! cache: tracked dirty pages in the cache */
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1041
+/*! cache: pages currently held in the cache */
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1042
+/*! cache: pages read into cache */
+#define WT_STAT_CONN_CACHE_READ 1043
+/*! cache: pages written from cache */
+#define WT_STAT_CONN_CACHE_WRITE 1044
+/*! conn: pthread mutex condition wait calls */
+#define WT_STAT_CONN_COND_WAIT 1045
+/*! Btree: cursor create calls */
+#define WT_STAT_CONN_CURSOR_CREATE 1046
+/*! Btree: cursor insert calls */
+#define WT_STAT_CONN_CURSOR_INSERT 1047
+/*! Btree: cursor next calls */
+#define WT_STAT_CONN_CURSOR_NEXT 1048
+/*! Btree: cursor prev calls */
+#define WT_STAT_CONN_CURSOR_PREV 1049
+/*! Btree: cursor remove calls */
+#define WT_STAT_CONN_CURSOR_REMOVE 1050
+/*! Btree: cursor reset calls */
+#define WT_STAT_CONN_CURSOR_RESET 1051
+/*! Btree: cursor search calls */
+#define WT_STAT_CONN_CURSOR_SEARCH 1052
+/*! Btree: cursor search near calls */
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1053
+/*! Btree: cursor update calls */
+#define WT_STAT_CONN_CURSOR_UPDATE 1054
+/*! dhandle: session dhandles swept */
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1055
+/*! dhandle: session sweep attempts */
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1056
+/*! conn: files currently open */
+#define WT_STAT_CONN_FILE_OPEN 1057
+/*! log: log buffer size increases */
+#define WT_STAT_CONN_LOG_BUFFER_GROW 1058
+/*! log: total log buffer size */
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1059
+/*! log: user provided log bytes written */
+#define WT_STAT_CONN_LOG_BYTES_USER 1060
+/*! log: log bytes written */
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1061
+/*! log: yields waiting for previous log file close */
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1062
+/*! log: maximum log file size */
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1063
+/*! log: log read operations */
+#define WT_STAT_CONN_LOG_READS 1064
+/*! log: records processed by log scan */
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1065
+/*! log: log scan records requiring two reads */
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1066
+/*! log: log scan operations */
+#define WT_STAT_CONN_LOG_SCANS 1067
+/*! log: consolidated slot closures */
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1068
+/*! log: logging bytes consolidated */
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1069
+/*! log: consolidated slot joins */
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1070
+/*! log: consolidated slot join races */
+#define WT_STAT_CONN_LOG_SLOT_RACES 1071
+/*! log: slots selected for switching that were unavailable */
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_FAILS 1072
+/*! log: record size exceeded maximum */
+#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1073
+/*! log: failed to find a slot large enough for record */
+#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1074
+/*! log: consolidated slot join transitions */
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1075
+/*! log: log sync operations */
+#define WT_STAT_CONN_LOG_SYNC 1076
+/*! log: log write operations */
+#define WT_STAT_CONN_LOG_WRITES 1077
+/*! LSM: sleep for LSM checkpoint throttle */
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1078
+/*! LSM: sleep for LSM merge throttle */
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1079
+/*! LSM: rows merged in an LSM tree */
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1080
+/*! LSM: App work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1081
+/*! LSM: Merge work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1082
+/*! LSM: tree queue hit maximum */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1083
+/*! LSM: Switch work units currently queued */
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1084
+/*! LSM: tree maintenance operations scheduled */
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1085
+/*! LSM: tree maintenance operations discarded */
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1086
+/*! LSM: tree maintenance operations executed */
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1087
+/*! conn: memory allocations */
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1088
+/*! conn: memory frees */
+#define WT_STAT_CONN_MEMORY_FREE 1089
+/*! conn: memory re-allocations */
+#define WT_STAT_CONN_MEMORY_GROW 1090
+/*! conn: total read I/Os */
+#define WT_STAT_CONN_READ_IO 1091
+/*! reconciliation: page reconciliation calls */
+#define WT_STAT_CONN_REC_PAGES 1092
+/*! reconciliation: page reconciliation calls for eviction */
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1093
+/*! reconciliation: split bytes currently awaiting free */
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1094
+/*! reconciliation: split objects currently awaiting free */
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1095
+/*! conn: pthread mutex shared lock read-lock calls */
+#define WT_STAT_CONN_RWLOCK_READ 1096
+/*! conn: pthread mutex shared lock write-lock calls */
+#define WT_STAT_CONN_RWLOCK_WRITE 1097
+/*! session: open cursor count */
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1098
+/*! session: open session count */
+#define WT_STAT_CONN_SESSION_OPEN 1099
+/*! txn: transaction begins */
+#define WT_STAT_CONN_TXN_BEGIN 1100
+/*! txn: transaction checkpoints */
+#define WT_STAT_CONN_TXN_CHECKPOINT 1101
+/*! txn: transaction checkpoint currently running */
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1102
+/*! txn: transactions committed */
+#define WT_STAT_CONN_TXN_COMMIT 1103
+/*! txn: transaction failures due to cache overflow */
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1104
+/*! txn: transaction range of IDs currently pinned */
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1105
+/*! txn: transactions rolled back */
+#define WT_STAT_CONN_TXN_ROLLBACK 1106
+/*! conn: total write I/Os */
+#define WT_STAT_CONN_WRITE_IO 1107
+
+/*!
+ * @}
+ * @name Statistics for data sources
+ * @anchor statistics_dsrc
+ * @{
+ */
+/*! block manager: file allocation unit size */
+#define WT_STAT_DSRC_ALLOCATION_SIZE 2000
+/*! block manager: blocks allocated */
+#define WT_STAT_DSRC_BLOCK_ALLOC 2001
+/*! block manager: checkpoint size */
+#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2002
+/*! block manager: allocations requiring file extension */
+#define WT_STAT_DSRC_BLOCK_EXTENSION 2003
+/*! block manager: blocks freed */
+#define WT_STAT_DSRC_BLOCK_FREE 2004
+/*! block manager: file magic number */
+#define WT_STAT_DSRC_BLOCK_MAGIC 2005
+/*! block manager: file major version number */
+#define WT_STAT_DSRC_BLOCK_MAJOR 2006
+/*! block manager: minor version number */
+#define WT_STAT_DSRC_BLOCK_MINOR 2007
+/*! block manager: file bytes available for reuse */
+#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2008
+/*! block manager: file size in bytes */
+#define WT_STAT_DSRC_BLOCK_SIZE 2009
+/*! LSM: bloom filters in the LSM tree */
+#define WT_STAT_DSRC_BLOOM_COUNT 2010
+/*! LSM: bloom filter false positives */
+#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2011
+/*! LSM: bloom filter hits */
+#define WT_STAT_DSRC_BLOOM_HIT 2012
+/*! LSM: bloom filter misses */
+#define WT_STAT_DSRC_BLOOM_MISS 2013
+/*! LSM: bloom filter pages evicted from cache */
+#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2014
+/*! LSM: bloom filter pages read into cache */
+#define WT_STAT_DSRC_BLOOM_PAGE_READ 2015
+/*! LSM: total size of bloom filters */
+#define WT_STAT_DSRC_BLOOM_SIZE 2016
+/*! btree: column-store variable-size deleted values */
+#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2017
+/*! btree: column-store fixed-size leaf pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2018
+/*! btree: column-store internal pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2019
+/*! btree: column-store variable-size leaf pages */
+#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2020
+/*! btree: pages rewritten by compaction */
+#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2021
+/*! btree: number of key/value pairs */
+#define WT_STAT_DSRC_BTREE_ENTRIES 2022
+/*! btree: fixed-record size */
+#define WT_STAT_DSRC_BTREE_FIXED_LEN 2023
+/*! btree: maximum tree depth */
+#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2024
+/*! btree: maximum internal page item size */
+#define WT_STAT_DSRC_BTREE_MAXINTLITEM 2025
+/*! btree: maximum internal page size */
+#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2026
+/*! btree: maximum leaf page item size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFITEM 2027
+/*! btree: maximum leaf page size */
+#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2028
+/*! btree: overflow pages */
+#define WT_STAT_DSRC_BTREE_OVERFLOW 2029
+/*! btree: row-store internal pages */
+#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2030
+/*! btree: row-store leaf pages */
+#define WT_STAT_DSRC_BTREE_ROW_LEAF 2031
+/*! cache: bytes read into cache */
+#define WT_STAT_DSRC_CACHE_BYTES_READ 2032
+/*! cache: bytes written from cache */
+#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2033
+/*! cache: checkpoint blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2034
+/*! cache: unmodified pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2035
+/*! cache: modified pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2036
+/*! cache: data source pages selected for eviction unable to be evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2037
+/*! cache: hazard pointer blocked page eviction */
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2038
+/*! cache: internal pages evicted */
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2039
+/*! cache: overflow values cached in memory */
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2040
+/*! cache: pages read into cache */
+#define WT_STAT_DSRC_CACHE_READ 2041
+/*! cache: overflow pages read into cache */
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2042
+/*! cache: pages written from cache */
+#define WT_STAT_DSRC_CACHE_WRITE 2043
+/*! compression: raw compression call failed, no additional data available */
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2044
+/*! compression: raw compression call failed, additional data available */
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2045
+/*! compression: raw compression call succeeded */
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2046
+/*! compression: compressed pages read */
+#define WT_STAT_DSRC_COMPRESS_READ 2047
+/*! compression: compressed pages written */
+#define WT_STAT_DSRC_COMPRESS_WRITE 2048
+/*! compression: page written failed to compress */
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2049
+/*! compression: page written was too small to compress */
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2050
+/*! cursor: create calls */
+#define WT_STAT_DSRC_CURSOR_CREATE 2051
+/*! cursor: insert calls */
+#define WT_STAT_DSRC_CURSOR_INSERT 2052
+/*! cursor: bulk-loaded cursor-insert calls */
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2053
+/*! cursor: cursor-insert key and value bytes inserted */
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2054
+/*! cursor: next calls */
+#define WT_STAT_DSRC_CURSOR_NEXT 2055
+/*! cursor: prev calls */
+#define WT_STAT_DSRC_CURSOR_PREV 2056
+/*! cursor: remove calls */
+#define WT_STAT_DSRC_CURSOR_REMOVE 2057
+/*! cursor: cursor-remove key bytes removed */
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2058
+/*! cursor: reset calls */
+#define WT_STAT_DSRC_CURSOR_RESET 2059
+/*! cursor: search calls */
+#define WT_STAT_DSRC_CURSOR_SEARCH 2060
+/*! cursor: search near calls */
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2061
+/*! cursor: update calls */
+#define WT_STAT_DSRC_CURSOR_UPDATE 2062
+/*! cursor: cursor-update value bytes updated */
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2063
+/*! LSM: sleep for LSM checkpoint throttle */
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2064
+/*! LSM: chunks in the LSM tree */
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2065
+/*! LSM: highest merge generation in the LSM tree */
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2066
+/*! LSM: queries that could have benefited from a Bloom filter that did
+ * not exist */
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2067
+/*! LSM: sleep for LSM merge throttle */
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2068
+/*! reconciliation: dictionary matches */
+#define WT_STAT_DSRC_REC_DICTIONARY 2069
+/*! reconciliation: internal page multi-block writes */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2070
+/*! reconciliation: leaf page multi-block writes */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2071
+/*! reconciliation: maximum blocks required for a page */
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2072
+/*! reconciliation: internal-page overflow keys */
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2073
+/*! reconciliation: leaf-page overflow keys */
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2074
+/*! reconciliation: overflow values written */
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2075
+/*! reconciliation: pages deleted */
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2076
+/*! reconciliation: page checksum matches */
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2077
+/*! reconciliation: page reconciliation calls */
+#define WT_STAT_DSRC_REC_PAGES 2078
+/*! reconciliation: page reconciliation calls for eviction */
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2079
+/*! reconciliation: leaf page key bytes discarded using prefix compression */
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2080
+/*! reconciliation: internal page key bytes discarded using suffix
+ * compression */
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2081
+/*! session: object compaction */
+#define WT_STAT_DSRC_SESSION_COMPACT 2082
+/*! session: open cursor count */
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2083
+/*! txn: update conflicts */
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2084
+/*! @} */
+/*
+ * Statistics section: END
+ * DO NOT EDIT: automatically built by dist/api_stat.py.
+ */
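+
+/*
+ * For example, an individual statistic can be read by passing one of the
+ * keys above to a statistics cursor (a sketch, error handling omitted;
+ * statistics cursors return a description, a printable value string and
+ * an int64_t value):
+ *
+ *	WT_CURSOR *cursor;
+ *	const char *desc, *pvalue;
+ *	int64_t value;
+ *	int ret;
+ *
+ *	ret = session->open_cursor(
+ *	    session, "statistics:", NULL, NULL, &cursor);
+ *	cursor->set_key(cursor, WT_STAT_CONN_CACHE_BYTES_INUSE);
+ *	ret = cursor->search(cursor);
+ *	ret = cursor->get_value(cursor, &desc, &pvalue, &value);
+ *	ret = cursor->close(cursor);
+ */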
+/*!
+ * @name Log record and operation types
+ * @anchor log_types
+ * @{
+ */
+/*
+ * DO NOT EDIT: automatically built by dist/log.py.
+ * Log record declarations: BEGIN
+ */
+/*! invalid operation */
+#define WT_LOGOP_INVALID 0
+/*! checkpoint */
+#define WT_LOGREC_CHECKPOINT 0
+/*! transaction commit */
+#define WT_LOGREC_COMMIT 1
+/*! file sync */
+#define WT_LOGREC_FILE_SYNC 2
+/*! message */
+#define WT_LOGREC_MESSAGE 3
+/*! column put */
+#define WT_LOGOP_COL_PUT 1
+/*! column remove */
+#define WT_LOGOP_COL_REMOVE 2
+/*! column truncate */
+#define WT_LOGOP_COL_TRUNCATE 3
+/*! row put */
+#define WT_LOGOP_ROW_PUT 4
+/*! row remove */
+#define WT_LOGOP_ROW_REMOVE 5
+/*! row truncate */
+#define WT_LOGOP_ROW_TRUNCATE 6
+/*
+ * Log record declarations: END
+ * DO NOT EDIT: automatically built by dist/log.py.
+ */
+/*! @} */
+/*! @} */
+
+#undef __F
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* __WIREDTIGER_H_ */
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger_ext.h b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
new file mode 100644
index 00000000000..fd0282cd50c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wiredtiger_ext.h
@@ -0,0 +1,398 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#ifndef __WIREDTIGER_EXT_H_
+#define __WIREDTIGER_EXT_H_
+
+#include <wiredtiger.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if !defined(SWIG)
+
+/*!
+ * @addtogroup wt_ext
+ * @{
+ */
+
+/*!
+ * Read-committed isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_COMMITTED 1
+/*!
+ * Read-uncommitted isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_READ_UNCOMMITTED 2
+/*!
+ * Snapshot isolation level, returned by
+ * WT_EXTENSION_API::transaction_isolation_level.
+ */
+#define WT_TXN_ISO_SNAPSHOT 3
+
+typedef struct __wt_txn_notify WT_TXN_NOTIFY;
+/*!
+ * The interface implemented by applications to be notified when the
+ * session's transaction is resolved; see
+ * WT_EXTENSION_API::transaction_notify.
+ */
+struct __wt_txn_notify {
+ /*!
+ * A method called when the session's current transaction is committed
+ * or rolled back.
+ *
+ * @param notify a pointer to the event handler
+ * @param session the current session handle
+ * @param txnid the transaction ID
+ * @param committed an integer value which is non-zero if the
+ * transaction is being committed.
+ */
+ int (*notify)(WT_TXN_NOTIFY *notify, WT_SESSION *session,
+ uint64_t txnid, int committed);
+};
+
+/*!
+ * Table of WiredTiger extension methods.
+ *
+ * This structure is used to provide a set of WiredTiger methods to extension
+ * modules without needing to link the modules with the WiredTiger library.
+ *
+ * The extension methods may be used both by modules that are linked with
+ * the WiredTiger library (for example, a data source configured using the
+ * WT_CONNECTION::add_data_source method), and by modules not linked with the
+ * WiredTiger library (for example, a compression module configured using the
+ * WT_CONNECTION::add_compressor method).
+ *
+ * To use these functions:
+ * - include the wiredtiger_ext.h header file,
+ * - declare a variable which references a WT_EXTENSION_API structure, and
+ * - initialize the variable using WT_CONNECTION::get_extension_api method.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API declaration
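+ *
+ * For example, a sketch of the initialization step in an extension's
+ * entry point:
+ *
+ * @code
+ *	static WT_EXTENSION_API *wt_api;
+ *
+ *	int
+ *	wiredtiger_extension_init(
+ *	    WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+ *	{
+ *		(void)config;
+ *
+ *		wt_api = connection->get_extension_api(connection);
+ *		return (0);
+ *	}
+ * @endcode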
+ *
+ * The following code is from the sample compression module, where compression
+ * extension functions are configured in the extension's entry point:
+ *
+ * @snippet nop_compress.c WT_COMPRESSOR initialization structure
+ * @snippet nop_compress.c WT_COMPRESSOR initialization function
+ */
+struct __wt_extension_api {
+/* !!! To maintain backwards compatibility, this structure is append-only. */
+#if !defined(DOXYGEN)
+ /*
+ * Private fields.
+ */
+ WT_CONNECTION *conn; /* Enclosing connection */
+#endif
+ /*!
+ * Insert an error message into the WiredTiger error stream.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param fmt a printf-like format specification
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API err_printf
+ */
+ int (*err_printf)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Insert a message into the WiredTiger message stream.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param fmt a printf-like format specification
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API msg_printf
+ */
+	int (*msg_printf)(WT_EXTENSION_API *wt_api,
+	    WT_SESSION *session, const char *fmt, ...);
+
+ /*!
+ * Return information about an error as a string; the strerror method
+ * is a superset of the ISO C99/POSIX 1003.1-2001 function strerror.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API strerror
+ *
+ * @param err a return value from a WiredTiger, C library or POSIX
+ * function
+ * @returns a string representation of the error
+ */
+ const char *(*strerror)(int err);
+
+ /*!
+ * Allocate short-term use scratch memory.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param bytes the number of bytes of memory needed
+ * @returns A valid memory reference on success or NULL on error
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API scr_alloc
+ */
+ void *(*scr_alloc)(
+ WT_EXTENSION_API *wt_api, WT_SESSION *session, size_t bytes);
+
+ /*!
+ * Free short-term use scratch memory.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param ref a memory reference returned by WT_EXTENSION_API::scr_alloc
+ *
+ * @snippet ex_data_source.c WT_EXTENSION_API scr_free
+ */
+	void (*scr_free)(
+	    WT_EXTENSION_API *wt_api, WT_SESSION *session, void *ref);
+
+ /*!
+ * Configure the extension collator method.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param config the configuration information passed to an application
+	 * @param collatorp the selected collator, if any
+ * @param ownp set if the collator terminate method should be called
+ * when no longer needed
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION collator config
+ */
+ int (*collator_config)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_CONFIG_ARG *config, WT_COLLATOR **collatorp, int *ownp);
+
+ /*!
+ * The extension collator method.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param collator the collator (or NULL if none available)
+ * @param first first item
+ * @param second second item
+ * @param[out] cmp set less than 0 if \c first collates less than
+ * \c second, set equal to 0 if \c first collates equally to \c second,
+ * set greater than 0 if \c first collates greater than \c second
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION collate
+ */
+ int (*collate)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_COLLATOR *collator, WT_ITEM *first, WT_ITEM *second, int *cmp);
+
+ /*!
+ * @copydoc wiredtiger_config_parser_open
+ */
+ int (*config_parser_open)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ const char *config, size_t len, WT_CONFIG_PARSER **config_parserp);
+
+ /*!
+ * Return the value of a configuration string.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+	 * @param config the configuration information passed to an application
+	 * @param key configuration key string
+ * @param value the returned value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION config_get
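+	 *
+	 * For example (a sketch; \c "my_option" is a hypothetical key):
+	 *
+	 * @code
+	 *	WT_CONFIG_ITEM v;
+	 *
+	 *	ret = wt_api->config_get(
+	 *	    wt_api, session, config, "my_option", &v);
+	 *	// On success, v.str and v.len reference the value string
+	 *	// and v.val holds its integer value, if any.
+	 * @endcode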
+ */
+ int (*config_get)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ WT_CONFIG_ARG *config, const char *key, WT_CONFIG_ITEM *value);
+
+ /*!
+ * Insert a row into the metadata if it does not already exist.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @param value row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata insert
+ */
+ int (*metadata_insert)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char *value);
+
+ /*!
+ * Remove a row from the metadata.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata remove
+ */
+ int (*metadata_remove)(
+ WT_EXTENSION_API *wt_api, WT_SESSION *session, const char *key);
+
+ /*!
+ * Return a row from the metadata.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+	 * @param[out] valuep the row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata search
+ */
+ int (*metadata_search)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char **valuep);
+
+ /*!
+ * Update a row in the metadata by either inserting a new record or
+ * updating an existing record.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle (or NULL if none available)
+ * @param key row key
+ * @param value row value
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION metadata update
+ */
+ int (*metadata_update)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, const char *key, const char *value);
+
+ /*!
+ * Pack a structure into a buffer.
+ * See ::wiredtiger_struct_pack for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
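+	 *
+	 * For example, sizing and packing an integer and a string (a
+	 * sketch, error handling omitted):
+	 *
+	 * @code
+	 *	char buf[64];
+	 *	size_t size;
+	 *
+	 *	ret = wt_api->struct_size(
+	 *	    wt_api, session, &size, "iS", 42, "hello");
+	 *	if (ret == 0 && size <= sizeof(buf))
+	 *		ret = wt_api->struct_pack(wt_api,
+	 *		    session, buf, sizeof(buf), "iS", 42, "hello");
+	 * @endcode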
+ */
+ int (*struct_pack)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ void *buffer, size_t size, const char *format, ...);
+
+ /*!
+ * Calculate the size required to pack a structure.
+ * See ::wiredtiger_struct_size for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param sizep a location where the number of bytes needed for the
+ * matching call to WT_EXTENSION_API::struct_pack is returned
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+ int (*struct_size)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ size_t *sizep, const char *format, ...);
+
+ /*!
+ * Unpack a structure from a buffer.
+ * See ::wiredtiger_struct_unpack for details.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param buffer a pointer to a packed byte array
+ * @param size the number of valid bytes in the buffer
+ * @param format the data format, see @ref packing
+ * @errors
+ */
+ int (*struct_unpack)(WT_EXTENSION_API *wt_api, WT_SESSION *session,
+ const void *buffer, size_t size, const char *format, ...);
+
+ /*!
+ * Return the current transaction ID.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @returns the current transaction ID.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction ID
+ */
+ uint64_t (*transaction_id)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session);
+
+ /*!
+ * Return the current transaction's isolation level; returns one of
+ * ::WT_TXN_ISO_READ_COMMITTED, ::WT_TXN_ISO_READ_UNCOMMITTED, or
+ * ::WT_TXN_ISO_SNAPSHOT.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @returns the current transaction's isolation level.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction isolation level
+ */
+ int (*transaction_isolation_level)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session);
+
+ /*!
+ * Request notification of transaction resolution by specifying a
+ * function to be called when the session's current transaction is
+ * either committed or rolled back. If the transaction is being
+ * committed, but the notification function returns an error, the
+ * transaction will be rolled back.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param notify a handler for commit or rollback events
+ * @errors
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction notify
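+	 *
+	 * For example (a sketch; the handler names are hypothetical):
+	 *
+	 * @code
+	 *	static int
+	 *	my_notify(WT_TXN_NOTIFY *notify, WT_SESSION *session,
+	 *	    uint64_t txnid, int committed)
+	 *	{
+	 *		(void)notify; (void)session; (void)txnid;
+	 *		(void)committed;
+	 *
+	 *		// Returning non-zero on commit forces a rollback.
+	 *		return (0);
+	 *	}
+	 *
+	 *	static WT_TXN_NOTIFY my_handler = { my_notify };
+	 *
+	 *	ret = wt_api->transaction_notify(wt_api, session, &my_handler);
+	 * @endcode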
+ */
+ int (*transaction_notify)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, WT_TXN_NOTIFY *notify);
+
+ /*!
+ * Return the oldest transaction ID not yet visible to a running
+ * transaction.
+ *
+	 * @param wt_api the extension handle
+ * @returns the oldest transaction ID not yet visible to a running
+ * transaction.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction oldest
+ */
+ uint64_t (*transaction_oldest)(WT_EXTENSION_API *wt_api);
+
+ /*!
+ * Return if the current transaction can see the given transaction ID.
+ *
+ * @param wt_api the extension handle
+ * @param session the session handle
+ * @param transaction_id the transaction ID
+ * @returns true (non-zero) if the transaction ID is visible to the
+ * current transaction.
+ *
+ * @snippet ex_data_source.c WT_EXTENSION transaction visible
+ */
+ int (*transaction_visible)(WT_EXTENSION_API *wt_api,
+ WT_SESSION *session, uint64_t transaction_id);
+
+ /*!
+ * @copydoc wiredtiger_version
+ */
+ const char *(*version)(int *majorp, int *minorp, int *patchp);
+};
+
+/*!
+ * @typedef WT_CONFIG_ARG
+ *
+ * A configuration object passed to some extension interfaces. This is an
+ * opaque type: configuration values can be queried using
+ * WT_EXTENSION_API::config_get.
+ */
+
+/*! @} */
+#endif /* SWIG */
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* __WIREDTIGER_EXT_H_ */
diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h
new file mode 100644
index 00000000000..e9482c688d3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/include/wt_internal.h
@@ -0,0 +1,337 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*******************************************
+ * WiredTiger public include file, and configuration control.
+ *******************************************/
+#include "wiredtiger_config.h"
+#include "wiredtiger_ext.h"
+
+/*******************************************
+ * WiredTiger system include files.
+ *******************************************/
+#ifndef _WIN32
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#endif
+#include <ctype.h>
+#ifndef _WIN32
+#include <dlfcn.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#ifdef _WIN32
+#include <io.h>
+#endif
+#include <limits.h>
+#ifndef _WIN32
+#include <pthread.h>
+#endif
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#include <stddef.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+#include <time.h>
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+/*******************************************
+ * WiredTiger externally maintained include files.
+ *******************************************/
+#include "queue.h"
+
+/*
+ * DO NOT EDIT: automatically built by dist/s_typedef.
+ * Forward type declarations for internal types: BEGIN
+ */
+struct __wt_addr;
+ typedef struct __wt_addr WT_ADDR;
+struct __wt_async;
+ typedef struct __wt_async WT_ASYNC;
+struct __wt_async_cursor;
+ typedef struct __wt_async_cursor WT_ASYNC_CURSOR;
+struct __wt_async_format;
+ typedef struct __wt_async_format WT_ASYNC_FORMAT;
+struct __wt_async_op_impl;
+ typedef struct __wt_async_op_impl WT_ASYNC_OP_IMPL;
+struct __wt_async_worker_state;
+ typedef struct __wt_async_worker_state WT_ASYNC_WORKER_STATE;
+struct __wt_block;
+ typedef struct __wt_block WT_BLOCK;
+struct __wt_block_ckpt;
+ typedef struct __wt_block_ckpt WT_BLOCK_CKPT;
+struct __wt_block_desc;
+ typedef struct __wt_block_desc WT_BLOCK_DESC;
+struct __wt_block_header;
+ typedef struct __wt_block_header WT_BLOCK_HEADER;
+struct __wt_bloom;
+ typedef struct __wt_bloom WT_BLOOM;
+struct __wt_bloom_hash;
+ typedef struct __wt_bloom_hash WT_BLOOM_HASH;
+struct __wt_bm;
+ typedef struct __wt_bm WT_BM;
+struct __wt_btree;
+ typedef struct __wt_btree WT_BTREE;
+struct __wt_cache;
+ typedef struct __wt_cache WT_CACHE;
+struct __wt_cache_pool;
+ typedef struct __wt_cache_pool WT_CACHE_POOL;
+struct __wt_cell;
+ typedef struct __wt_cell WT_CELL;
+struct __wt_cell_unpack;
+ typedef struct __wt_cell_unpack WT_CELL_UNPACK;
+struct __wt_ckpt;
+ typedef struct __wt_ckpt WT_CKPT;
+struct __wt_col;
+ typedef struct __wt_col WT_COL;
+struct __wt_col_rle;
+ typedef struct __wt_col_rle WT_COL_RLE;
+struct __wt_colgroup;
+ typedef struct __wt_colgroup WT_COLGROUP;
+struct __wt_compact;
+ typedef struct __wt_compact WT_COMPACT;
+struct __wt_condvar;
+ typedef struct __wt_condvar WT_CONDVAR;
+struct __wt_config;
+ typedef struct __wt_config WT_CONFIG;
+struct __wt_config_check;
+ typedef struct __wt_config_check WT_CONFIG_CHECK;
+struct __wt_config_entry;
+ typedef struct __wt_config_entry WT_CONFIG_ENTRY;
+struct __wt_config_parser_impl;
+ typedef struct __wt_config_parser_impl WT_CONFIG_PARSER_IMPL;
+struct __wt_connection_impl;
+ typedef struct __wt_connection_impl WT_CONNECTION_IMPL;
+struct __wt_connection_stats;
+ typedef struct __wt_connection_stats WT_CONNECTION_STATS;
+struct __wt_connection_stats_spinlock;
+ typedef struct __wt_connection_stats_spinlock WT_CONNECTION_STATS_SPINLOCK;
+struct __wt_cursor_backup;
+ typedef struct __wt_cursor_backup WT_CURSOR_BACKUP;
+struct __wt_cursor_backup_entry;
+ typedef struct __wt_cursor_backup_entry WT_CURSOR_BACKUP_ENTRY;
+struct __wt_cursor_btree;
+ typedef struct __wt_cursor_btree WT_CURSOR_BTREE;
+struct __wt_cursor_bulk;
+ typedef struct __wt_cursor_bulk WT_CURSOR_BULK;
+struct __wt_cursor_config;
+ typedef struct __wt_cursor_config WT_CURSOR_CONFIG;
+struct __wt_cursor_data_source;
+ typedef struct __wt_cursor_data_source WT_CURSOR_DATA_SOURCE;
+struct __wt_cursor_dump;
+ typedef struct __wt_cursor_dump WT_CURSOR_DUMP;
+struct __wt_cursor_index;
+ typedef struct __wt_cursor_index WT_CURSOR_INDEX;
+struct __wt_cursor_json;
+ typedef struct __wt_cursor_json WT_CURSOR_JSON;
+struct __wt_cursor_log;
+ typedef struct __wt_cursor_log WT_CURSOR_LOG;
+struct __wt_cursor_lsm;
+ typedef struct __wt_cursor_lsm WT_CURSOR_LSM;
+struct __wt_cursor_metadata;
+ typedef struct __wt_cursor_metadata WT_CURSOR_METADATA;
+struct __wt_cursor_stat;
+ typedef struct __wt_cursor_stat WT_CURSOR_STAT;
+struct __wt_cursor_table;
+ typedef struct __wt_cursor_table WT_CURSOR_TABLE;
+struct __wt_data_handle;
+ typedef struct __wt_data_handle WT_DATA_HANDLE;
+struct __wt_data_handle_cache;
+ typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE;
+struct __wt_dlh;
+ typedef struct __wt_dlh WT_DLH;
+struct __wt_dsrc_stats;
+ typedef struct __wt_dsrc_stats WT_DSRC_STATS;
+struct __wt_evict_entry;
+ typedef struct __wt_evict_entry WT_EVICT_ENTRY;
+struct __wt_evict_worker;
+ typedef struct __wt_evict_worker WT_EVICT_WORKER;
+struct __wt_ext;
+ typedef struct __wt_ext WT_EXT;
+struct __wt_extlist;
+ typedef struct __wt_extlist WT_EXTLIST;
+struct __wt_fh;
+ typedef struct __wt_fh WT_FH;
+struct __wt_hazard;
+ typedef struct __wt_hazard WT_HAZARD;
+struct __wt_ikey;
+ typedef struct __wt_ikey WT_IKEY;
+struct __wt_index;
+ typedef struct __wt_index WT_INDEX;
+struct __wt_insert;
+ typedef struct __wt_insert WT_INSERT;
+struct __wt_insert_head;
+ typedef struct __wt_insert_head WT_INSERT_HEAD;
+struct __wt_log_desc;
+ typedef struct __wt_log_desc WT_LOG_DESC;
+struct __wt_log_op_desc;
+ typedef struct __wt_log_op_desc WT_LOG_OP_DESC;
+struct __wt_log_rec_desc;
+ typedef struct __wt_log_rec_desc WT_LOG_REC_DESC;
+struct __wt_lsm_chunk;
+ typedef struct __wt_lsm_chunk WT_LSM_CHUNK;
+struct __wt_lsm_data_source;
+ typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE;
+struct __wt_lsm_manager;
+ typedef struct __wt_lsm_manager WT_LSM_MANAGER;
+struct __wt_lsm_tree;
+ typedef struct __wt_lsm_tree WT_LSM_TREE;
+struct __wt_lsm_work_unit;
+ typedef struct __wt_lsm_work_unit WT_LSM_WORK_UNIT;
+struct __wt_lsm_worker_args;
+ typedef struct __wt_lsm_worker_args WT_LSM_WORKER_ARGS;
+struct __wt_lsm_worker_cookie;
+ typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE;
+struct __wt_multi;
+ typedef struct __wt_multi WT_MULTI;
+struct __wt_named_collator;
+ typedef struct __wt_named_collator WT_NAMED_COLLATOR;
+struct __wt_named_compressor;
+ typedef struct __wt_named_compressor WT_NAMED_COMPRESSOR;
+struct __wt_named_data_source;
+ typedef struct __wt_named_data_source WT_NAMED_DATA_SOURCE;
+struct __wt_ovfl_reuse;
+ typedef struct __wt_ovfl_reuse WT_OVFL_REUSE;
+struct __wt_ovfl_track;
+ typedef struct __wt_ovfl_track WT_OVFL_TRACK;
+struct __wt_ovfl_txnc;
+ typedef struct __wt_ovfl_txnc WT_OVFL_TXNC;
+struct __wt_page;
+ typedef struct __wt_page WT_PAGE;
+struct __wt_page_deleted;
+ typedef struct __wt_page_deleted WT_PAGE_DELETED;
+struct __wt_page_header;
+ typedef struct __wt_page_header WT_PAGE_HEADER;
+struct __wt_page_index;
+ typedef struct __wt_page_index WT_PAGE_INDEX;
+struct __wt_page_modify;
+ typedef struct __wt_page_modify WT_PAGE_MODIFY;
+struct __wt_process;
+ typedef struct __wt_process WT_PROCESS;
+struct __wt_ref;
+ typedef struct __wt_ref WT_REF;
+struct __wt_row;
+ typedef struct __wt_row WT_ROW;
+struct __wt_rwlock;
+ typedef struct __wt_rwlock WT_RWLOCK;
+struct __wt_salvage_cookie;
+ typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
+struct __wt_scratch_track;
+ typedef struct __wt_scratch_track WT_SCRATCH_TRACK;
+struct __wt_session_impl;
+ typedef struct __wt_session_impl WT_SESSION_IMPL;
+struct __wt_size;
+ typedef struct __wt_size WT_SIZE;
+struct __wt_split_stash;
+ typedef struct __wt_split_stash WT_SPLIT_STASH;
+struct __wt_stats;
+ typedef struct __wt_stats WT_STATS;
+struct __wt_table;
+ typedef struct __wt_table WT_TABLE;
+struct __wt_txn;
+ typedef struct __wt_txn WT_TXN;
+struct __wt_txn_global;
+ typedef struct __wt_txn_global WT_TXN_GLOBAL;
+struct __wt_txn_op;
+ typedef struct __wt_txn_op WT_TXN_OP;
+struct __wt_txn_state;
+ typedef struct __wt_txn_state WT_TXN_STATE;
+struct __wt_upd_skipped;
+ typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
+struct __wt_update;
+ typedef struct __wt_update WT_UPDATE;
+/*
+ * Forward type declarations for internal types: END
+ * DO NOT EDIT: automatically built by dist/s_typedef.
+ */
+
+/*******************************************
+ * WiredTiger internal include files.
+ *******************************************/
+#if defined(_lint)
+#include "lint.h"
+#elif defined(__GNUC__)
+#include "gcc.h"
+#elif defined(_MSC_VER)
+#include "msvc.h"
+#endif
+#include "hardware.h"
+
+#ifdef _WIN32
+#include "os_windows.h"
+#else
+#include "posix.h"
+#endif
+
+#include "misc.h"
+#include "mutex.h"
+
+#include "stat.h" /* required by dhandle.h */
+#include "dhandle.h" /* required by btree.h */
+
+#include "api.h"
+#include "async.h"
+#include "block.h"
+#include "bloom.h"
+#include "btmem.h"
+#include "btree.h"
+#include "cache.h"
+#include "config.h"
+#include "compact.h"
+#include "cursor.h"
+#include "dlh.h"
+#include "error.h"
+#include "flags.h"
+#include "log.h"
+#include "lsm.h"
+#include "meta.h"
+#include "os.h"
+#include "schema.h"
+#include "txn.h"
+
+#include "session.h" /* required by connection.h */
+#include "connection.h"
+
+#include "extern.h"
+#include "verify_build.h"
+
+#include "buf.i"
+#include "misc.i"
+#include "intpack.i" /* required by cell.i, packing.i */
+#include "packing.i"
+#include "cell.i" /* required by btree.i */
+
+#include "mutex.i" /* required by btree.i */
+#include "txn.i" /* required by btree.i */
+
+#include "btree.i" /* required by cursor.i */
+#include "cache.i" /* required by cursor.i */
+#include "cursor.i"
+
+#include "bitstring.i"
+#include "column.i"
+#include "serial.i"
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
new file mode 100644
index 00000000000..d13002cdc5a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -0,0 +1,1243 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_log_ckpt --
+ * Record the given LSN as the checkpoint LSN and signal the archive
+ * thread as needed.
+ */
+int
+__wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ log->ckpt_lsn = *ckp_lsn;
+ if (conn->arch_cond != NULL)
+ WT_RET(__wt_cond_signal(session, conn->arch_cond));
+ return (0);
+}
+
+/*
+ * __wt_log_written_reset --
+ *	Interface to reset the amount of log written during this
+ *	checkpoint period.  Called from the checkpoint code.
+ */
+void
+__wt_log_written_reset(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ if (!conn->logging)
+ return;
+ log = conn->log;
+ log->log_written = 0;
+ return;
+}
+
+/*
+ * __wt_log_get_files --
+ * Retrieve the list of all existing log files.
+ */
+int
+__wt_log_get_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+ WT_CONNECTION_IMPL *conn;
+ const char *log_path;
+
+ *countp = 0;
+ *filesp = NULL;
+
+ conn = S2C(session);
+ log_path = conn->log_path;
+ if (log_path == NULL)
+ log_path = "";
+ return (__wt_dirlist(session, log_path, WT_LOG_FILENAME,
+ WT_DIRLIST_INCLUDE, filesp, countp));
+}
+
+/*
+ * __wt_log_get_active_files --
+ * Retrieve the list of active log files (those that are not candidates
+ * for archiving).
+ */
+int
+__wt_log_get_active_files(
+ WT_SESSION_IMPL *session, char ***filesp, u_int *countp)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+ char **files;
+ uint32_t id;
+ u_int count, i;
+
+ id = 0;
+ log = S2C(session)->log;
+
+ WT_RET(__wt_log_get_files(session, &files, &count));
+
+ /* Filter out any files that are below the checkpoint LSN. */
+ for (i = 0; i < count; ) {
+ WT_ERR(__wt_log_extract_lognum(session, files[i], &id));
+ if (id < log->ckpt_lsn.file) {
+ __wt_free(session, files[i]);
+ files[i] = files[count - 1];
+ files[--count] = NULL;
+ } else
+ i++;
+ }
+
+ *filesp = files;
+ *countp = count;
+
+ if (0) {
+err: __wt_log_files_free(session, files, count);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_log_files_free --
+ * Free memory associated with a log file list.
+ */
+void
+__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count)
+{
+ u_int i;
+
+ for (i = 0; i < count; i++)
+ __wt_free(session, files[i]);
+ __wt_free(session, files);
+}
+
+/*
+ * __wt_log_filename --
+ * Given a log number, return a WT_ITEM of a generated log file name.
+ */
+int
+__wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, WT_ITEM *buf)
+{
+ const char *log_path;
+
+ log_path = S2C(session)->log_path;
+
+ if (log_path != NULL && log_path[0] != '\0')
+ WT_RET(__wt_buf_fmt(session, buf, "%s/%s.%010" PRIu32,
+ log_path, WT_LOG_FILENAME, id));
+ else
+ WT_RET(__wt_buf_fmt(session, buf, "%s.%010" PRIu32,
+ WT_LOG_FILENAME, id));
+
+ return (0);
+}
+
+/*
+ * __wt_log_extract_lognum --
+ *	Given a log file name, extract the log number.
+ */
+int
+__wt_log_extract_lognum(
+ WT_SESSION_IMPL *session, const char *name, uint32_t *id)
+{
+ const char *p;
+
+ WT_UNUSED(session);
+
+ if (id == NULL || name == NULL)
+ return (WT_ERROR);
+ if ((p = strrchr(name, '.')) == NULL ||
+ sscanf(++p, "%" PRIu32, id) != 1)
+ WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name);
+ return (0);
+}
+
+/*
+ * __wt_log_remove --
+ * Given a log number, remove that log file.
+ */
+int
+__wt_log_remove(WT_SESSION_IMPL *session, uint32_t lognum)
+{
+ WT_DECL_ITEM(path);
+ WT_DECL_RET;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &path));
+ WT_ERR(__wt_log_filename(session, lognum, path));
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_remove: remove log %s", (char *)path->data));
+ WT_ERR(__wt_remove(session, path->data));
+err: __wt_scr_free(&path);
+ return (ret);
+}
+
+/*
+ * __log_openfile --
+ * Open a log file with the given log file number and return the WT_FH.
+ */
+static int
+__log_openfile(WT_SESSION_IMPL *session, int ok_create, WT_FH **fh, uint32_t id)
+{
+ WT_DECL_ITEM(path);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &path));
+ WT_ERR(__wt_log_filename(session, id, path));
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "opening log %s", (const char *)path->data));
+ WT_ERR(__wt_open(
+ session, path->data, ok_create, 0, WT_FILE_TYPE_LOG, fh));
+err: __wt_scr_free(&path);
+ return (ret);
+}
+
+/*
+ * __wt_log_open --
+ * Open the appropriate log file for the connection. The purpose is
+ * to find the last log file that exists, open it and set our initial
+ * LSNs to the end of that file. If none exist, call __wt_log_newfile
+ * to create it.
+ */
+int
+__wt_log_open(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ uint32_t firstlog, lastlog, lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ log->fileid = lastlog;
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+	    "log_open: first log %" PRIu32 " last log %" PRIu32,
+	    firstlog, lastlog));
+ log->first_lsn.file = firstlog;
+ log->first_lsn.offset = 0;
+
+ /*
+ * Start logging at the beginning of the next log file, no matter
+ * where the previous log file ends.
+ */
+ WT_ERR(__wt_log_newfile(session, 1));
+
+ /*
+ * If there were log files, run recovery.
+ * XXX belongs at a higher level than this.
+ */
+ if (logcount > 0) {
+ log->trunc_lsn = log->alloc_lsn;
+ WT_ERR(__wt_txn_recover(conn));
+ }
+
+err: __wt_log_files_free(session, logfiles, logcount);
+ return (ret);
+}
+
+/*
+ * __wt_log_close --
+ * Close the log file.
+ */
+int
+__wt_log_close(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) {
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "closing old log %s", log->log_close_fh->name));
+ WT_RET(__wt_close(session, log->log_close_fh));
+ }
+ if (log->log_fh != NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "closing log %s", log->log_fh->name));
+ WT_RET(__wt_close(session, log->log_fh));
+ log->log_fh = NULL;
+ }
+ return (0);
+}
+
+/*
+ * __log_fill --
+ * Copy a thread's log records into the assigned slot.
+ */
+static int
+__log_fill(WT_SESSION_IMPL *session,
+ WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+{
+ WT_DECL_RET;
+ WT_LOG_RECORD *logrec;
+
+ logrec = (WT_LOG_RECORD *)record->mem;
+ /*
+ * Call __wt_write. For now the offset is the real byte offset.
+ * If the offset becomes a unit of LOG_ALIGN this is where we would
+ * multiply by LOG_ALIGN to get the real file byte offset for write().
+ */
+ if (direct)
+ WT_ERR(__wt_write(session, myslot->slot->slot_fh,
+ myslot->offset + myslot->slot->slot_start_offset,
+ (size_t)logrec->len, (void *)logrec));
+ else
+ memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+ logrec, logrec->len);
+
+ WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
+ if (lsnp != NULL) {
+ *lsnp = myslot->slot->slot_start_lsn;
+ lsnp->offset += (wt_off_t)myslot->offset;
+ }
+err:
+ if (ret != 0 && myslot->slot->slot_error == 0)
+ myslot->slot->slot_error = ret;
+ return (ret);
+}
+
+/*
+ * __log_size_fit --
+ * Return whether or not recsize will fit in the log file.
+ */
+static int
+__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ return (lsn->offset + (wt_off_t)recsize < conn->log_file_max);
+}
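+
+/*
+ * A worked example of the fit check, with made-up numbers: if log_file_max
+ * is 100MB (104857600 bytes), a 128-byte record at LSN offset 104857472
+ * fails the strict less-than test (104857472 + 128 == 104857600), so the
+ * record forces a switch to a new log file rather than exactly filling the
+ * current one.
+ */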
+
+/*
+ * __log_truncate --
+ *	Truncate the log to the given LSN.  If this_log is set, truncate only
+ *	the log file indicated by the given LSN.  If not set, also truncate
+ *	the log files between the given LSN and the trunc_lsn: because we
+ *	pre-allocate log files, this frees that space and keeps the log
+ *	traversable.  We use the trunc_lsn because logging has already opened
+ *	the new/next log file before recovery ran.  This function assumes it
+ *	runs during recovery or another dedicated phase, not during live
+ *	operation.
+ */
+static int
+__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, uint32_t this_log)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh, *tmp_fh;
+ WT_LOG *log;
+ uint32_t lognum;
+ u_int i, logcount;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+
+ /*
+ * Truncate the log file to the given LSN.
+ */
+ WT_ERR(__log_openfile(session, 0, &log_fh, lsn->file));
+ WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+
+ /*
+ * If we just want to truncate the current log, return and skip
+ * looking for intervening logs.
+ */
+ if (this_log)
+ goto err;
+ WT_ERR(__wt_log_get_files(session, &logfiles, &logcount));
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
+ if (lognum > lsn->file && lognum < log->trunc_lsn.file) {
+ WT_ERR(__log_openfile(session, 0, &log_fh, lognum));
+ /*
+ * If there are intervening files pre-allocated,
+ * truncate them to the end of the log file header.
+ */
+ WT_ERR(__wt_ftruncate(session,
+ log_fh, LOG_FIRST_RECORD));
+ tmp_fh = log_fh;
+ log_fh = NULL;
+ WT_ERR(__wt_close(session, tmp_fh));
+ }
+ }
+err: if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ return (ret);
+}
+
+/*
+ * __log_filesize --
+ *	Return an estimate of the real end of the log file.
+ */
+static int
+__log_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *eof)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ wt_off_t log_size, off, off1;
+ uint32_t allocsize, bufsz;
+ char *buf, *zerobuf;
+
+ conn = S2C(session);
+ log = conn->log;
+ if (eof == NULL)
+ return (0);
+ *eof = 0;
+ WT_RET(__wt_filesize(session, fh, &log_size));
+ if (log == NULL)
+ allocsize = LOG_ALIGN;
+ else
+ allocsize = log->allocsize;
+
+ /*
+ * It can be very slow looking for the last real record in the log
+ * in very small chunks. Walk backward by a megabyte at a time. When
+ * we find a part of the log that is not just zeroes, walk to find
+ * the last record.
+ */
+ buf = zerobuf = NULL;
+ if (allocsize < WT_MEGABYTE && log_size > WT_MEGABYTE)
+ bufsz = WT_MEGABYTE;
+ else
+ bufsz = allocsize;
+ WT_RET(__wt_calloc_def(session, bufsz, &buf));
+ WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf));
+
+ /*
+ * Read in a chunk starting at the end of the file. Keep going until
+ * we reach the beginning or we find a chunk that contains any non-zero
+ * bytes. Compare against a known zero byte chunk.
+ */
+ for (off = log_size - (wt_off_t)bufsz;
+ off >= 0;
+ off -= (wt_off_t)bufsz) {
+ WT_ERR(__wt_read(session, fh, off, bufsz, buf));
+ if (memcmp(buf, zerobuf, bufsz) != 0)
+ break;
+ }
+
+ /*
+	 * If we found a non-zero chunk, we walk it below by the real allocsize
+	 * to find the real end; otherwise we reached the beginning of the
+	 * file.  The offset can go negative if the log file size is not a
+	 * multiple of a megabyte, so clamp it: the first chunk of the log
+	 * file is always non-zero.
+ */
+ if (off < 0)
+ off = 0;
+
+ /*
+ * We know all log records are aligned at log->allocsize. The first
+ * item in a log record is always a 32-bit length. Look for any
+ * non-zero length at the allocsize boundary. This may not be a true
+ * log record since it could be the middle of a large record. But we
+ * know no log record starts after it. Return an estimate of the log
+ * file size.
+ */
+ for (off1 = bufsz - allocsize;
+ off1 > 0; off1 -= (wt_off_t)allocsize)
+ if (memcmp(buf + off1, zerobuf, sizeof(uint32_t)) != 0)
+ break;
+ off = off + off1;
+
+ /*
+	 * Set EOF to just past the last non-zero record we found, that is,
+	 * the first zero-filled position.
+ */
+ *eof = off + (wt_off_t)allocsize;
+err:
+ if (buf != NULL)
+ __wt_free(session, buf);
+ if (zerobuf != NULL)
+ __wt_free(session, zerobuf);
+ return (ret);
+}
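+
+/*
+ * To illustrate the scan above with hypothetical numbers: for a 128MB
+ * pre-allocated file with a 128-byte allocsize, the first loop reads
+ * backward in 1MB chunks until it finds a chunk containing non-zero bytes,
+ * and the second loop then walks that chunk backward on 128-byte boundaries
+ * looking for a non-zero length word.  That is roughly 128 large reads plus
+ * at most 8,192 four-byte compares, instead of a million 128-byte reads.
+ */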
+
+/*
+ * __log_acquire --
+ * Called with the log slot lock held. Can be called recursively
+ * from __wt_log_newfile when we change log files.
+ */
+static int
+__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * Called locked. Add recsize to alloc_lsn. Save our starting LSN
+ * where the previous allocation finished for the release LSN.
+ * That way when log files switch, we're waiting for the correct LSN
+ * from outstanding writes.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__wt_log_newfile(session, 0));
+ if (log->log_close_fh != NULL)
+ F_SET(slot, SLOT_CLOSEFH);
+ }
+ /*
+	 * Checkpoints can be configured based on the amount of log written.
+	 * Add this log record to the sum and, if needed, signal the
+	 * checkpoint condition.  The logging subsystem manages the
+	 * accumulated field.  There is a bit of a layering violation here:
+	 * we check the connection's checkpoint field and use its condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)recsize;
+ WT_RET(__wt_checkpoint_signal(session, log->log_written));
+ }
+
+ /*
+ * Need to minimally fill in slot info here. Our slot start LSN
+ * comes after any potential new log file creations.
+ */
+ slot->slot_start_lsn = log->alloc_lsn;
+ slot->slot_start_offset = log->alloc_lsn.offset;
+ /*
+ * Pre-allocate on the first real write into the log file.
+ */
+ if (log->alloc_lsn.offset == LOG_FIRST_RECORD) {
+ if (!log->log_fh->fallocate_available ||
+ (ret = __wt_fallocate(session, log->log_fh,
+ LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
+ ret = __wt_ftruncate(session, log->log_fh,
+ LOG_FIRST_RECORD + conn->log_file_max);
+ WT_RET(ret);
+ }
+
+ log->alloc_lsn.offset += (wt_off_t)recsize;
+ slot->slot_end_lsn = log->alloc_lsn;
+ slot->slot_error = 0;
+ slot->slot_fh = log->log_fh;
+ return (0);
+}
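+
+/*
+ * A sketch of the LSN arithmetic above, assuming no file switch is needed:
+ * if alloc_lsn is (file 3, offset 4096) and recsize is 512, the slot gets
+ * slot_start_lsn (3, 4096) and slot_end_lsn (3, 4608), and alloc_lsn
+ * advances to (3, 4608) so the next caller allocates contiguous space.
+ */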
+
+/*
+ * __log_release --
+ * Release a log slot.
+ */
+static int
+__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *close_fh;
+ WT_LOG *log;
+ WT_LSN sync_lsn;
+ size_t write_size;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If we're going to have to close our log file, make a local copy
+ * of the file handle structure.
+ */
+ close_fh = NULL;
+ if (F_ISSET(slot, SLOT_CLOSEFH)) {
+ close_fh = log->log_close_fh;
+ log->log_close_fh = NULL;
+ F_CLR(slot, SLOT_CLOSEFH);
+ }
+
+ /* Write the buffered records */
+ if (F_ISSET(slot, SLOT_BUFFERED)) {
+ write_size = (size_t)
+ (slot->slot_end_lsn.offset - slot->slot_start_offset);
+ WT_ERR(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, write_size, slot->slot_buf.mem));
+ }
+
+ /*
+ * Wait for earlier groups to finish, otherwise there could be holes
+ * in the log file.
+ */
+ while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
+ __wt_yield();
+ log->write_lsn = slot->slot_end_lsn;
+ /*
+	 * Try to consolidate calls to fsync so that threads wait less.
+	 * Acquire a spin lock so that threads finishing writing to the log
+	 * wait while the current fsync completes; that fsync covers the log
+	 * up to log->write_lsn as of when the lock was taken.
+ */
+ while (F_ISSET(slot, SLOT_SYNC) &&
+ LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ continue;
+ }
+ /*
+		 * Record the current end of the log after we grabbed the
+		 * lock.  That is as far as our fsync call will guarantee.
+ */
+ sync_lsn = log->write_lsn;
+ if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ WT_STAT_FAST_CONN_INCR(session, log_sync);
+ ret = __wt_fsync(session, log->log_fh);
+ if (ret == 0) {
+ F_CLR(slot, SLOT_SYNC);
+ log->sync_lsn = sync_lsn;
+ ret = __wt_cond_signal(
+ session, log->log_sync_cond);
+ }
+ }
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ WT_ERR(ret);
+ }
+ if (F_ISSET(slot, SLOT_BUF_GROW)) {
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, slot->slot_buf.memsize);
+ WT_ERR(__wt_buf_grow(session,
+ &slot->slot_buf, slot->slot_buf.memsize * 2));
+ }
+ /*
+ * If we have a file to close, close it now.
+ */
+ if (close_fh)
+ WT_ERR(__wt_close(session, close_fh));
+
+err: if (ret != 0 && slot->slot_error == 0)
+ slot->slot_error = ret;
+ return (ret);
+}
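+
+/*
+ * The sync loop above amounts to the following pattern: a thread needing
+ * durability either performs the fsync itself, covering everything up to
+ * write_lsn as of when it took log_sync_lock, or waits on log_sync_cond
+ * for another thread's fsync to advance sync_lsn past its slot_end_lsn.
+ * One fsync can therefore satisfy many concurrent commits.
+ */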
+
+/*
+ * __wt_log_newfile --
+ * Create the next log file and write the file header record into it.
+ */
+int
+__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_DESC *desc;
+ WT_LOG_RECORD *logrec;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+	 * Set aside the log file handle to be closed later.  Other threads
+	 * may still be using it to write to the log.  If the log file size
+	 * is small, we could fill a log file before the previous one has
+	 * been closed; wait for it to close before proceeding.
+ */
+ while (log->log_close_fh != NULL) {
+ __wt_errx(session,
+ "log_newfile: Log file size %" PRIuMAX " too small",
+ (uintmax_t)conn->log_file_max);
+ WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+ __wt_yield();
+ }
+ log->log_close_fh = log->log_fh;
+ log->fileid++;
+ WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
+ log->alloc_lsn.file = log->fileid;
+ log->alloc_lsn.offset = log->log_fh->size;
+
+ /*
+ * Set up the log descriptor record. Use a scratch buffer to
+ * get correct alignment for direct I/O.
+ */
+ WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
+ WT_RET(__wt_scr_alloc(session, log->allocsize, &buf));
+ memset(buf->mem, 0, log->allocsize);
+ logrec = (WT_LOG_RECORD *)buf->mem;
+ desc = (WT_LOG_DESC *)logrec->record;
+ desc->log_magic = WT_LOG_MAGIC;
+ desc->majorv = WT_LOG_MAJOR_VERSION;
+ desc->minorv = WT_LOG_MINOR_VERSION;
+ desc->log_size = (uint64_t)conn->log_file_max;
+
+ /*
+ * Now that the record is set up, initialize the record header.
+ */
+ logrec->len = log->allocsize;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, log->allocsize);
+ WT_CLEAR(tmp);
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+
+ /*
+ * Recursively call __log_acquire to allocate log space for the
+ * log descriptor record. Call __log_fill to write it, but we
+ * do not need to call __log_release because we're not waiting for
+ * earlier operations to complete.
+ */
+ WT_ERR(__log_acquire(session, logrec->len, &tmp));
+ WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
+
+ /*
+ * If we're called from connection creation code, we need to update
+ * the LSNs since we're the only write in progress.
+ */
+ if (conn_create) {
+ WT_ERR(__wt_fsync(session, log->log_fh));
+ log->sync_lsn = tmp.slot_end_lsn;
+ log->write_lsn = tmp.slot_end_lsn;
+ }
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_log_read --
+ * Read the log record at the given LSN. Return the record (including
+ * the log header) in the WT_ITEM. Caller is responsible for freeing it.
+ */
+int
+__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ uint32_t cksum, rdup_len, reclen;
+
+ WT_UNUSED(flags);
+ /*
+ * If the caller didn't give us an LSN or something to return,
+ * there's nothing to do.
+ */
+ if (lsnp == NULL || record == NULL)
+ return (0);
+ conn = S2C(session);
+ log = conn->log;
+ /*
+ * If the offset isn't on an allocation boundary it must be wrong.
+ */
+ if (lsnp->offset % log->allocsize != 0 || lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file));
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ERR(__wt_buf_init(session, record, log->allocsize));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)log->allocsize, record->mem));
+ /*
+	 * The first 4 bytes are the real record length.  See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)record->mem;
+ if (reclen == 0) {
+ ret = WT_NOTFOUND;
+ goto err;
+ }
+ if (reclen > log->allocsize) {
+ rdup_len = __wt_rduppo2(reclen, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, lsnp->offset, (size_t)rdup_len, record->mem));
+ }
+ /*
+	 * We've read the record, now verify the checksum.
+ */
+ logrec = (WT_LOG_RECORD *)record->mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum)
+ WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");
+ record->size = logrec->len;
+ WT_STAT_FAST_CONN_INCR(session, log_reads);
+err:
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
+
+/*
+ * __wt_log_scan --
+ * Scan the logs, calling a function on each record found.
+ */
+int
+__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
+ int (*func)(WT_SESSION_IMPL *session,
+ WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_ITEM buf;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN end_lsn, rd_lsn, start_lsn;
+ wt_off_t log_size;
+ uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
+ u_int i, logcount;
+ int eol;
+ char **logfiles;
+
+ conn = S2C(session);
+ log = conn->log;
+ log_fh = NULL;
+ logcount = 0;
+ logfiles = NULL;
+ eol = 0;
+ WT_CLEAR(buf);
+
+ /*
+ * If the caller did not give us a callback function there is nothing
+ * to do.
+ */
+ if (func == NULL)
+ return (0);
+
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_RET(__wt_verbose(session, WT_VERB_LOG,
+ "__wt_log_scan truncating to %u/%" PRIuMAX,
+ log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));
+
+ if (log != NULL) {
+ allocsize = log->allocsize;
+
+ if (lsnp == NULL) {
+ if (LF_ISSET(WT_LOGSCAN_FIRST))
+ start_lsn = log->first_lsn;
+ else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
+ start_lsn = log->ckpt_lsn;
+ else
+ return (WT_ERROR); /* Illegal usage */
+ } else {
+ if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
+ WT_RET_MSG(session, WT_ERROR,
+ "choose either a start LSN or a start flag");
+
+ /* Offsets must be on allocation boundaries. */
+ if (lsnp->offset % allocsize != 0 ||
+ lsnp->file > log->fileid)
+ return (WT_NOTFOUND);
+
+ /*
+ * Log cursors may not know the starting LSN. If an
+ * LSN pointer is passed in, but it is the INIT_LSN,
+ * start from the first_lsn.
+ */
+ start_lsn = *lsnp;
+ if (IS_INIT_LSN(&start_lsn))
+ start_lsn = log->first_lsn;
+ }
+ end_lsn = log->alloc_lsn;
+ } else {
+ /*
+ * If logging is not configured, we can still print out the log
+ * if log files exist. We just need to set the LSNs from what
+ * is in the files versus what is in the live connection.
+ */
+ /*
+ * Set allocsize to the minimum alignment it could be. Larger
+ * records and larger allocation boundaries should always be
+ * a multiple of this.
+ */
+ allocsize = LOG_ALIGN;
+ lastlog = 0;
+ firstlog = UINT32_MAX;
+ WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
+ if (logcount == 0)
+ /*
+			 * Return ENOTSUP if no log files exist.
+ */
+ return (ENOTSUP);
+ for (i = 0; i < logcount; i++) {
+ WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
+ &lognum));
+ lastlog = WT_MAX(lastlog, lognum);
+ firstlog = WT_MIN(firstlog, lognum);
+ }
+ start_lsn.file = firstlog;
+ end_lsn.file = lastlog;
+ start_lsn.offset = end_lsn.offset = 0;
+ __wt_log_files_free(session, logfiles, logcount);
+ logfiles = NULL;
+ }
+ WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ rd_lsn = start_lsn;
+ WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
+ for (;;) {
+ if (rd_lsn.offset + allocsize > log_size) {
+advance:
+ /*
+ * If we read the last record, go to the next file.
+ */
+ WT_ERR(__wt_close(session, log_fh));
+ log_fh = NULL;
+ eol = 1;
+ /*
+ * Truncate this log file before we move to the next.
+ */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER))
+ WT_ERR(__log_truncate(session, &rd_lsn, 1));
+ rd_lsn.file++;
+ rd_lsn.offset = 0;
+ /*
+			 * Avoid an error message when we reach the end of
+			 * the log by checking here.
+ */
+ if (rd_lsn.file > end_lsn.file)
+ break;
+ WT_ERR(__log_openfile(
+ session, 0, &log_fh, rd_lsn.file));
+ WT_ERR(__log_filesize(session, log_fh, &log_size));
+ continue;
+ }
+ /*
+ * Read the minimum allocation size a record could be.
+ */
+ WT_ASSERT(session, buf.memsize >= allocsize);
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
+ /*
+		 * The first 4 bytes are the real record length.  See if we
+ * need to read more than the allocation size. We expect
+ * that we rarely will have to read more. Most log records
+ * will be fairly small.
+ */
+ reclen = *(uint32_t *)buf.mem;
+ /*
+ * Log files are pre-allocated. We never expect a zero length
+ * unless we've reached the end of the log. The log can be
+ * written out of order, so when recovery finds the end of
+ * the log, truncate the file and remove any later log files
+ * that may exist.
+ */
+ if (reclen == 0) {
+ /* This LSN is the end. */
+ break;
+ }
+ rdup_len = __wt_rduppo2(reclen, allocsize);
+ if (reclen > allocsize) {
+ /*
+ * The log file end could be the middle of this
+ * log record.
+ */
+ if (rd_lsn.offset + rdup_len > log_size)
+ goto advance;
+ /*
+ * We need to round up and read in the full padded
+ * record, especially for direct I/O.
+ */
+ WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
+ WT_ERR(__wt_read(session,
+ log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
+ WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
+ }
+ /*
+		 * We've read the record, now verify the checksum.
+ */
+ buf.size = reclen;
+ logrec = (WT_LOG_RECORD *)buf.mem;
+ cksum = logrec->checksum;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, logrec->len);
+ if (logrec->checksum != cksum) {
+ /*
+ * A checksum mismatch means we have reached the end of
+ * the useful part of the log. This should be found on
+ * the first pass through recovery. In the second pass
+ * where we truncate the log, this is where it should
+ * end.
+ */
+ if (log != NULL)
+ log->trunc_lsn = rd_lsn;
+ break;
+ }
+
+ /*
+ * We have a valid log record. If it is not the log file
+ * header, invoke the callback.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_scan_records);
+ if (rd_lsn.offset != 0) {
+ WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
+ if (LF_ISSET(WT_LOGSCAN_ONE))
+ break;
+ }
+ rd_lsn.offset += (wt_off_t)rdup_len;
+ }
+
+ /* Truncate if we're in recovery. */
+ if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
+ LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ WT_ERR(__log_truncate(session, &rd_lsn, 0));
+
+err: WT_STAT_FAST_CONN_INCR(session, log_scans);
+ if (logfiles != NULL)
+ __wt_log_files_free(session, logfiles, logcount);
+ __wt_buf_free(session, &buf);
+ /*
+	 * If the caller wants one record and it is at the end of the log,
+ * return WT_NOTFOUND.
+ */
+ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
+ ret = WT_NOTFOUND;
+ if (ret == ENOENT)
+ ret = 0;
+ if (log_fh != NULL)
+ WT_TRET(__wt_close(session, log_fh));
+ return (ret);
+}
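+
+/*
+ * A hypothetical caller sketch (the names here are illustrative, not part
+ * of the tree): scan every record from the start of the log, counting the
+ * records seen.
+ *
+ *	static int
+ *	count_record(WT_SESSION_IMPL *session,
+ *	    WT_ITEM *record, WT_LSN *lsnp, void *cookie)
+ *	{
+ *		WT_UNUSED(session);
+ *		WT_UNUSED(record);
+ *		WT_UNUSED(lsnp);
+ *		++*(uint64_t *)cookie;		/* Count this record. */
+ *		return (0);
+ *	}
+ *
+ *	uint64_t count = 0;
+ *	WT_RET(__wt_log_scan(
+ *	    session, NULL, WT_LOGSCAN_FIRST, count_record, &count));
+ */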
+
+/*
+ * __log_direct_write --
+ * Write a log record without using the consolidation arrays.
+ */
+static int
+__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT tmp;
+ WT_MYSLOT myslot;
+ int locked;
+ WT_DECL_SPINLOCK_ID(id); /* Must appear last */
+
+ log = S2C(session)->log;
+ myslot.slot = &tmp;
+ myslot.offset = 0;
+ WT_CLEAR(tmp);
+
+ /* Fast path the contended case. */
+ if (__wt_spin_trylock(session, &log->log_slot_lock, &id) != 0)
+ return (EAGAIN);
+ locked = 1;
+
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(&tmp, SLOT_SYNC);
+ WT_ERR(__log_acquire(session, record->size, &tmp));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
+ WT_ERR(__log_release(session, &tmp));
+
+err: if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ return (ret);
+}
+
+/*
+ * __wt_log_write --
+ * Write a record into the log.
+ */
+int
+__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
+ uint32_t flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOG_RECORD *logrec;
+ WT_LSN lsn;
+ WT_MYSLOT myslot;
+ uint32_t rdup_len;
+ int locked;
+
+ conn = S2C(session);
+ log = conn->log;
+ locked = 0;
+ INIT_LSN(&lsn);
+ myslot.slot = NULL;
+ /*
+ * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
+ * a header at the beginning for us to fill in.
+ *
+ * If using direct_io, the caller should pass us an aligned record.
+ * But we need to make sure it is big enough and zero-filled so
+ * that we can write the full amount. Do this whether or not
+ * direct_io is in use because it makes the reading code cleaner.
+ */
+ WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
+ rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
+ WT_ERR(__wt_buf_grow(session, record, rdup_len));
+ WT_ASSERT(session, record->data == record->mem);
+ /*
+ * If the caller's record only partially fills the necessary
+ * space, we need to zero-fill the remainder.
+ */
+ if (record->size != rdup_len) {
+ memset((uint8_t *)record->mem + record->size, 0,
+ rdup_len - record->size);
+ record->size = rdup_len;
+ }
+ logrec = (WT_LOG_RECORD *)record->mem;
+ logrec->len = (uint32_t)record->size;
+ logrec->checksum = 0;
+ logrec->checksum = __wt_cksum(logrec, record->size);
+
+ WT_STAT_FAST_CONN_INCR(session, log_writes);
+
+ if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
+ ret = __log_direct_write(session, record, lsnp, flags);
+ if (ret == 0)
+ return (0);
+ if (ret != EAGAIN)
+ WT_ERR(ret);
+ /*
+ * An EAGAIN return means we failed to get the try lock -
+ * fall through to the consolidation code in that case.
+ */
+ }
+
+ /*
+ * As soon as we see contention for the log slot, disable direct
+ * log writes. We get better performance by forcing writes through
+ * the consolidation code. This is because individual writes flood
+ * the I/O system faster than they contend on the log slot lock.
+ */
+ F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
+ if ((ret = __wt_log_slot_join(
+ session, rdup_len, flags, &myslot)) == ENOMEM) {
+ /*
+		 * If we couldn't find a consolidated slot for this record,
+		 * write the record directly.
+ */
+ while ((ret = __log_direct_write(
+ session, record, lsnp, flags)) == EAGAIN)
+ ;
+ WT_ERR(ret);
+ /*
+ * Increase the buffer size of any slots we can get access
+ * to, so future consolidations are likely to succeed.
+ */
+ WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
+ return (0);
+ }
+ WT_ERR(ret);
+ if (myslot.offset == 0) {
+ __wt_spin_lock(session, &log->log_slot_lock);
+ locked = 1;
+ WT_ERR(__wt_log_slot_close(session, myslot.slot));
+ WT_ERR(__log_acquire(
+ session, myslot.slot->slot_group_size, myslot.slot));
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ locked = 0;
+ WT_ERR(__wt_log_slot_notify(session, myslot.slot));
+ } else
+ WT_ERR(__wt_log_slot_wait(session, myslot.slot));
+ WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
+ if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
+ WT_ERR(__log_release(session, myslot.slot));
+ WT_ERR(__wt_log_slot_free(myslot.slot));
+ } else if (LF_ISSET(WT_LOG_FSYNC)) {
+ /* Wait for our writes to reach disk */
+ while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ myslot.slot->slot_error == 0)
+ (void)__wt_cond_wait(
+ session, log->log_sync_cond, 10000);
+ }
+err:
+ if (locked)
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ if (ret == 0 && lsnp != NULL)
+ *lsnp = lsn;
+ /*
+ * If we're synchronous and some thread had an error, we don't know
+ * if our write made it out to the file or not. The error could be
+ * before or after us. So, if anyone got an error, we report it.
+ * If we're not synchronous, only report if our own operation got
+ * an error.
+ */
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
+ myslot.slot != NULL)
+ ret = myslot.slot->slot_error;
+ return (ret);
+}
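+
+/*
+ * A worked example of the padding above: with a 128-byte allocsize, a
+ * 100-byte record (header included) is rounded up to an rdup_len of 128,
+ * the trailing 28 bytes are zero-filled, logrec->len becomes 128 and the
+ * checksum is computed over the full padded record with the checksum field
+ * itself zeroed, matching how __wt_log_read and __wt_log_scan verify it.
+ */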
+
+/*
+ * __wt_log_vprintf --
+ * Write a message into the log.
+ */
+int
+__wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ va_list ap_copy;
+ const char *rec_fmt = WT_UNCHECKED_STRING(I);
+ uint32_t rectype = WT_LOGREC_MESSAGE;
+ size_t header_size, len;
+
+ conn = S2C(session);
+
+ if (!conn->logging)
+ return (0);
+
+ va_copy(ap_copy, ap);
+ len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1;
+ va_end(ap_copy);
+
+ WT_RET(
+ __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec));
+
+ /*
+ * We're writing a record with the type (an integer) followed by a
+ * string (NUL-terminated data). To avoid writing the string into
+ * a buffer before copying it, we write the header first, then the
+ * raw bytes of the string.
+ */
+ WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype));
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ rec_fmt, rectype));
+ logrec->size += (uint32_t)header_size;
+
+ (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LOG,
+ "log_printf: %s", (char *)logrec->data + logrec->size));
+
+ logrec->size += len;
+ WT_ERR(__wt_log_write(session, logrec, NULL, 0));
+err: __wt_scr_free(&logrec);
+ return (ret);
+}
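+
+/*
+ * For example, logging the message "hello" through this path produces a
+ * record laid out as the WT_LOG_RECORD header, then the packed
+ * WT_LOGREC_MESSAGE record type, then the raw bytes "hello\0";
+ * __wt_log_write then pads and checksums the whole record as usual.
+ */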
diff --git a/src/third_party/wiredtiger/src/log/log_auto.c b/src/third_party/wiredtiger/src/log/log_auto.c
new file mode 100644
index 00000000000..f3db79f4daf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log_auto.c
@@ -0,0 +1,437 @@
+/* DO NOT EDIT: automatically built by dist/log.py. */
+
+#include "wt_internal.h"
+
+int
+__wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp)
+{
+ WT_ITEM *logrec;
+
+ WT_RET(__wt_scr_alloc(session, WT_ALIGN(size + 1, LOG_ALIGN), &logrec));
+ WT_CLEAR(*(WT_LOG_RECORD *)logrec->data);
+ logrec->size = offsetof(WT_LOG_RECORD, record);
+
+ *logrecp = logrec;
+ return (0);
+}
+
+void
+__wt_logrec_free(WT_SESSION_IMPL *session, WT_ITEM **logrecp)
+{
+ WT_UNUSED(session);
+ __wt_scr_free(logrecp);
+}
+
+int
+__wt_logrec_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end, uint32_t *rectypep)
+{
+ uint64_t rectype;
+
+ WT_UNUSED(session);
+ WT_RET(__wt_vunpack_uint(pp, WT_PTRDIFF(end, *pp), &rectype));
+ *rectypep = (uint32_t)rectype;
+ return (0);
+}
+
+int
+__wt_logop_read(WT_SESSION_IMPL *session,
+ const uint8_t **pp, const uint8_t *end,
+ uint32_t *optypep, uint32_t *opsizep)
+{
+ return (__wt_struct_unpack(
+ session, *pp, WT_PTRDIFF(end, *pp), "II", optypep, opsizep));
+}
+
+int
+__wt_logop_col_put_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t recno, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_PUT;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, recno, value));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, recno, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_put_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *recnop, WT_ITEM *valuep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIru);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, recnop, valuep));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_PUT);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_put_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t recno;
+ WT_ITEM value;
+
+ WT_RET(__wt_logop_col_put_unpack(
+ session, pp, end, &fileid, &recno, &value));
+
+ fprintf(out, " \"optype\": \"col_put\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+ fprintf(out, " \"value\": \"%.*s\",\n",
+ (int)value.size, (const char *)value.data);
+ return (0);
+}
+
+int
+__wt_logop_col_remove_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t recno)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIr);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_REMOVE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, recno));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, recno));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_remove_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *recnop)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIr);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, recnop));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_REMOVE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_remove_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t recno;
+
+ WT_RET(__wt_logop_col_remove_unpack(
+ session, pp, end, &fileid, &recno));
+
+ fprintf(out, " \"optype\": \"col_remove\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"recno\": \"%" PRIu64 "\",\n", recno);
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, uint64_t start, uint64_t stop)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIrr);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_COL_TRUNCATE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, start, stop));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, start, stop));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, uint64_t *startp, uint64_t *stopp)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIrr);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, startp, stopp));
+ WT_ASSERT(session, optype == WT_LOGOP_COL_TRUNCATE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_col_truncate_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ uint64_t start;
+ uint64_t stop;
+
+ WT_RET(__wt_logop_col_truncate_unpack(
+ session, pp, end, &fileid, &start, &stop));
+
+ fprintf(out, " \"optype\": \"col_truncate\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"start\": \"%" PRIu64 "\",\n", start);
+ fprintf(out, " \"stop\": \"%" PRIu64 "\",\n", stop);
+ return (0);
+}
+
+int
+__wt_logop_row_put_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *key, WT_ITEM *value)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_PUT;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, key, value));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, key, value));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_put_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *keyp, WT_ITEM *valuep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuu);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, keyp, valuep));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_PUT);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_put_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM key;
+ WT_ITEM value;
+
+ WT_RET(__wt_logop_row_put_unpack(
+ session, pp, end, &fileid, &key, &value));
+
+ fprintf(out, " \"optype\": \"row_put\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"key\": \"%.*s\",\n",
+ (int)key.size, (const char *)key.data);
+ fprintf(out, " \"value\": \"%.*s\",\n",
+ (int)value.size, (const char *)value.data);
+ return (0);
+}
+
+int
+__wt_logop_row_remove_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *key)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_REMOVE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, key));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, key));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_remove_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *keyp)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIu);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, keyp));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_REMOVE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_remove_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM key;
+
+ WT_RET(__wt_logop_row_remove_unpack(
+ session, pp, end, &fileid, &key));
+
+ fprintf(out, " \"optype\": \"row_remove\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"key\": \"%.*s\",\n",
+ (int)key.size, (const char *)key.data);
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_pack(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec,
+ uint32_t fileid, WT_ITEM *start, WT_ITEM *stop, uint32_t mode)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuuI);
+ size_t size;
+ uint32_t optype, recsize;
+
+ optype = WT_LOGOP_ROW_TRUNCATE;
+ WT_RET(__wt_struct_size(session, &size, fmt,
+ optype, 0, fileid, start, stop, mode));
+
+ __wt_struct_size_adjust(session, &size);
+ WT_RET(__wt_buf_extend(session, logrec, logrec->size + size));
+ recsize = (uint32_t)size;
+ WT_RET(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, size, fmt,
+ optype, recsize, fileid, start, stop, mode));
+
+ logrec->size += (uint32_t)size;
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_unpack(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep)
+{
+ const char *fmt = WT_UNCHECKED_STRING(IIIuuI);
+ uint32_t optype, size;
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &optype, &size, fileidp, startp, stopp, modep));
+ WT_ASSERT(session, optype == WT_LOGOP_ROW_TRUNCATE);
+
+ *pp += size;
+ return (0);
+}
+
+int
+__wt_logop_row_truncate_print(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t fileid;
+ WT_ITEM start;
+ WT_ITEM stop;
+ uint32_t mode;
+
+ WT_RET(__wt_logop_row_truncate_unpack(
+ session, pp, end, &fileid, &start, &stop, &mode));
+
+ fprintf(out, " \"optype\": \"row_truncate\",\n");
+ fprintf(out, " \"fileid\": \"%" PRIu32 "\",\n", fileid);
+ fprintf(out, " \"start\": \"%.*s\",\n",
+ (int)start.size, (const char *)start.data);
+ fprintf(out, " \"stop\": \"%.*s\",\n",
+ (int)stop.size, (const char *)stop.data);
+ fprintf(out, " \"mode\": \"%" PRIu32 "\",\n", mode);
+ return (0);
+}
+
+int
+__wt_txn_op_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ uint32_t optype, opsize;
+
+ /* Peek at the size and the type. */
+ WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_RET(__wt_logop_col_put_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_RET(__wt_logop_col_remove_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_RET(__wt_logop_col_truncate_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_RET(__wt_logop_row_put_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_RET(__wt_logop_row_remove_print(session, pp, end, out));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_RET(__wt_logop_row_truncate_print(session, pp, end, out));
+ break;
+
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
new file mode 100644
index 00000000000..c12f47d231b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -0,0 +1,354 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * This file implements the consolidated array algorithm as described in
+ * the paper:
+ * Scalability of write-ahead logging on multicore and multisocket hardware
+ * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis
+ * and Anastasia Ailamaki.
+ *
+ * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can
+ * be found at:
+ * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf
+ */
+
+/*
+ * __wt_log_slot_init --
+ * Initialize the slot array.
+ */
+int
+__wt_log_slot_init(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int32_t i;
+
+ conn = S2C(session);
+ log = conn->log;
+ for (i = 0; i < SLOT_POOL; i++) {
+ log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
+ log->slot_pool[i].slot_index = SLOT_INVALID_INDEX;
+ }
+
+ /*
+ * Set up the available slots from the pool the first time.
+ */
+ for (i = 0; i < SLOT_ACTIVE; i++) {
+ slot = &log->slot_pool[i];
+ slot->slot_index = (uint32_t)i;
+ slot->slot_state = WT_LOG_SLOT_READY;
+ log->slot_array[i] = slot;
+ }
+
+ /*
+	 * Allocate memory for buffers now that the arrays are set up.  Split
+ * this out to make error handling simpler.
+ */
+ for (i = 0; i < SLOT_POOL; i++) {
+ WT_ERR(__wt_buf_init(session,
+ &log->slot_pool[i].slot_buf, WT_LOG_SLOT_BUF_INIT_SIZE));
+ F_SET(&log->slot_pool[i], SLOT_BUFFERED);
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_buffer_size, WT_LOG_SLOT_BUF_INIT_SIZE * SLOT_POOL);
+ if (0) {
+err: while (--i >= 0)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_log_slot_destroy --
+ * Clean up the slot array on shutdown.
+ */
+int
+__wt_log_slot_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ for (i = 0; i < SLOT_POOL; i++)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ return (0);
+}
+
+/*
+ * __wt_log_slot_join --
+ *	Join a consolidated logging slot.  Callers should be prepared to deal
+ *	with an ENOMEM return, which indicates no slot could accommodate
+ *	the log record.
+ */
+int
+__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
+ uint32_t flags, WT_MYSLOT *myslotp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t cur_state, new_state, old_state;
+ uint32_t allocated_slot, slot_grow_attempts;
+
+ conn = S2C(session);
+ log = conn->log;
+ slot_grow_attempts = 0;
+find_slot:
+ allocated_slot = __wt_random(session->rnd) % SLOT_ACTIVE;
+ slot = log->slot_array[allocated_slot];
+ old_state = slot->slot_state;
+join_slot:
+ /*
+ * WT_LOG_SLOT_READY and higher means the slot is available for
+ * joining. Any other state means it is in use and transitioning
+ * from the active array.
+ */
+ if (old_state < WT_LOG_SLOT_READY) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_transitions);
+ goto find_slot;
+ }
+ /*
+ * Add in our size to the state and then atomically swap that
+ * into place if it is still the same value.
+ */
+ new_state = old_state + (int64_t)mysize;
+ if (new_state < old_state) {
+ /* Our size doesn't fit here. */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
+ goto find_slot;
+ }
+ /*
+ * If the slot buffer isn't big enough to hold this update, mark
+ * the slot for a buffer size increase and find another slot.
+ */
+ if (new_state > (int64_t)slot->slot_buf.memsize) {
+ F_SET(slot, SLOT_BUF_GROW);
+ if (++slot_grow_attempts > 5) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall);
+ return (ENOMEM);
+ }
+ goto find_slot;
+ }
+ cur_state = WT_ATOMIC_CAS_VAL8(slot->slot_state, old_state, new_state);
+ /*
+ * We lost a race to add our size into this slot. Check the state
+ * and try again.
+ */
+ if (cur_state != old_state) {
+ old_state = cur_state;
+ WT_STAT_FAST_CONN_INCR(session, log_slot_races);
+ goto join_slot;
+ }
+ WT_ASSERT(session, myslotp != NULL);
+ /*
+ * We joined this slot. Fill in our information to return to
+ * the caller.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(slot, SLOT_SYNC);
+ myslotp->slot = slot;
+ myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY;
+ return (0);
+}
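+
+/*
+ * A worked example of the state arithmetic, with made-up numbers: a freshly
+ * readied slot has slot_state == WT_LOG_SLOT_READY.  If thread A joins with
+ * a 512-byte record and thread B with 256 bytes, the CAS loop moves the
+ * state to WT_LOG_SLOT_READY + 768, and the threads are handed offsets 0
+ * and 512 respectively (old_state - WT_LOG_SLOT_READY), so each copies into
+ * a disjoint region of the slot buffer.
+ */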
+
+/*
+ * __wt_log_slot_close --
+ * Close a slot and do not allow any other threads to join this slot.
+ * Remove this from the active slot array and move a new slot from
+ *	the pool into its place.  Set up the size of this group.
+ *	Must be called with the logging spinlock held.
+ */
+int
+__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *newslot;
+ int64_t old_state;
+ int32_t yields;
+ uint32_t pool_i, switch_fails;
+
+ conn = S2C(session);
+ log = conn->log;
+ switch_fails = 0;
+retry:
+ /*
+ * Find an unused slot in the pool.
+ */
+ pool_i = log->pool_index;
+ newslot = &log->slot_pool[pool_i];
+ if (++log->pool_index >= SLOT_POOL)
+ log->pool_index = 0;
+ if (newslot->slot_state != WT_LOG_SLOT_FREE) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_switch_fails);
+ /*
+		 * If it takes a number of attempts to find an available slot,
+		 * it's likely all slots are waiting to be released.  This
+		 * churn count is used to change how long we pause before
+		 * closing the slot, which leads to more consolidation and
+		 * less churn.
+ */
+ if (++switch_fails % SLOT_POOL == 0 &&
+ switch_fails != 0 && slot->slot_churn < 5)
+ ++slot->slot_churn;
+ __wt_yield();
+ goto retry;
+ } else if (slot->slot_churn > 0) {
+ --slot->slot_churn;
+ WT_ASSERT(session, slot->slot_churn >= 0);
+ }
+
+ /* Pause to allow other threads a chance to consolidate. */
+ for (yields = slot->slot_churn; yields >= 0; yields--)
+ __wt_yield();
+
+ /*
+ * Swap out the slot we're going to use and put a free one in the
+ * slot array in its place so that threads can use it right away.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
+ newslot->slot_state = WT_LOG_SLOT_READY;
+ newslot->slot_index = slot->slot_index;
+ log->slot_array[newslot->slot_index] = &log->slot_pool[pool_i];
+ old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
+ slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
+ /*
+ * Note that this statistic may be much bigger than in reality,
+ * especially when compared with the total bytes written in
+ * __log_fill. The reason is that this size reflects any
+ * rounding up that is needed and the total bytes in __log_fill
+ * is the amount of user bytes.
+ */
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_slot_consolidated, (uint64_t)slot->slot_group_size);
+ return (0);
+}
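+
+/*
+ * Continuing the example from __wt_log_slot_join: when the leader closes
+ * the slot, the state is atomically replaced with WT_LOG_SLOT_PENDING and
+ * slot_group_size becomes old_state - WT_LOG_SLOT_READY (768 bytes for the
+ * two joiners), which is exactly the amount of log space the leader then
+ * reserves with __log_acquire.
+ */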
+
+/*
+ * __wt_log_slot_notify --
+ * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE.
+ */
+int
+__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_UNUSED(session);
+
+ slot->slot_state =
+ (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size;
+ return (0);
+}
+
+/*
+ * __wt_log_slot_wait --
+ * Wait for slot leader to allocate log area and tell us our log offset.
+ */
+int
+__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+{
+ WT_UNUSED(session);
+
+ while (slot->slot_state > WT_LOG_SLOT_DONE)
+ __wt_yield();
+ return (0);
+}
+
+/*
+ * __wt_log_slot_release --
+ * Each thread in a consolidated group releases its portion to
+ * signal it has completed writing its piece of the log.
+ */
+int64_t
+__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size)
+{
+ int64_t newsize;
+
+ /*
+ * Add my size into the state. When it reaches WT_LOG_SLOT_DONE
+	 * all participating threads have completed copying their piece.
+ */
+ newsize = WT_ATOMIC_ADD8(slot->slot_state, (int64_t)size);
+ return (newsize);
+}
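+
+/*
+ * Finishing the worked example: __wt_log_slot_notify sets the state to
+ * WT_LOG_SLOT_DONE - 768, then each joiner adds its own size back as it
+ * finishes copying (512, then 256).  The thread whose addition brings the
+ * state to exactly WT_LOG_SLOT_DONE knows the buffer is complete and
+ * releases the slot to be written and freed.
+ */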
+
+/*
+ * __wt_log_slot_free --
+ * Free a slot back into the pool.
+ */
+int
+__wt_log_slot_free(WT_LOGSLOT *slot)
+{
+ slot->slot_state = WT_LOG_SLOT_FREE;
+ return (0);
+}
+
+/*
+ * __wt_log_slot_grow_buffers --
+ * Increase the buffer size of all available slots in the buffer pool.
+ * Go to some lengths to include active (but unused) slots to handle
+ * the case where all log write record sizes exceed the size of the
+ * active buffer.
+ */
+int
+__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t orig_state;
+ uint64_t old_size, total_growth;
+ int i;
+
+ conn = S2C(session);
+ log = conn->log;
+ total_growth = 0;
+ WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
+ /*
+ * Take the log slot lock to prevent other threads growing buffers
+ * at the same time. Could tighten the scope of this lock, or have
+ * a separate lock if there is contention.
+ */
+ __wt_spin_lock(session, &log->log_slot_lock);
+ for (i = 0; i < SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ /* Avoid atomic operations if they won't succeed. */
+ if (slot->slot_state != WT_LOG_SLOT_FREE &&
+ slot->slot_state != WT_LOG_SLOT_READY)
+ continue;
+ /* Don't keep growing unrelated buffers. */
+ if (slot->slot_buf.memsize > (10 * newsize) &&
+ !F_ISSET(slot, SLOT_BUF_GROW))
+ continue;
+ orig_state = WT_ATOMIC_CAS_VAL8(
+ slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
+ if (orig_state != WT_LOG_SLOT_FREE) {
+ orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
+ WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
+ if (orig_state != WT_LOG_SLOT_READY)
+ continue;
+ }
+
+ /* We have a slot - now go ahead and grow the buffer. */
+ old_size = slot->slot_buf.memsize;
+ F_CLR(slot, SLOT_BUF_GROW);
+ WT_ERR(__wt_buf_grow(session, &slot->slot_buf,
+ WT_MAX(slot->slot_buf.memsize * 2, newsize)));
+ slot->slot_state = orig_state;
+ total_growth += slot->slot_buf.memsize - old_size;
+ }
+err: __wt_spin_unlock(session, &log->log_slot_lock);
+ WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
new file mode 100644
index 00000000000..f50706fb2e9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -0,0 +1,1519 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define WT_FORALL_CURSORS(clsm, c, i) \
+ for ((i) = (clsm)->nchunks; (i) > 0;) \
+ if (((c) = (clsm)->cursors[--i]) != NULL)
+
+#define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \
+ __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp)
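+
+/*
+ * Note the iteration order: WT_FORALL_CURSORS visits the chunk cursors
+ * from newest (cursors[nchunks - 1]) to oldest (cursors[0]), skipping NULL
+ * entries, so searches encounter the most recently written version of a
+ * key first.
+ */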
+
+static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *);
+static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t);
+static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *);
+
+/*
+ * __clsm_enter_update --
+ * Make sure an LSM cursor is ready to perform an update.
+ */
+static int
+__clsm_enter_update(WT_CURSOR_LSM *clsm)
+{
+ WT_CURSOR *primary;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *primary_chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ int have_primary, ovfl, waited;
+
+ lsm_tree = clsm->lsm_tree;
+ if (clsm->nchunks == 0 ||
+ (primary = clsm->cursors[clsm->nchunks - 1]) == NULL)
+ return (0);
+ session = (WT_SESSION_IMPL *)primary->session;
+ primary_chunk = clsm->primary_chunk;
+ have_primary = (primary_chunk != NULL &&
+ primary_chunk->switch_txn == WT_TXN_NONE);
+ ovfl = 0;
+
+ /*
+ * In LSM there are multiple btrees active at one time. The tree
+ * switch code needs to use btree API methods, and it wants to
+ * operate on the btree for the primary chunk. Set that up now.
+ *
+ * If the primary chunk has grown too large, set a flag so the worker
+ * thread will switch when it gets a chance to avoid introducing high
+ * latency into application threads. Don't do this indefinitely: if a
+ * chunk grows twice as large as the configured size, block until it
+ * can be switched.
+ */
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ if (have_primary)
+ WT_WITH_BTREE(session,
+ ((WT_CURSOR_BTREE *)primary)->btree,
+ ovfl = __wt_btree_size_overflow(
+ session, lsm_tree->chunk_size));
+
+ if (ovfl || !have_primary) {
+ /*
+ * Check that we are up-to-date: don't set the switch
+ * if the tree has changed since we last opened
+			 * cursors; that can lead to switching multiple times
+ * when only one switch is required, creating very
+ * small chunks.
+ */
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (clsm->dsk_gen == lsm_tree->dsk_gen &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ ret = __wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ }
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ WT_RET(ret);
+ ovfl = 0;
+ }
+ } else if (have_primary)
+ WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
+ ovfl = __wt_btree_size_overflow(
+ session, 2 * lsm_tree->chunk_size));
+
+ /*
+ * If there is no primary chunk, or it has really overflowed, which
+ * either means a worker thread has fallen behind or there has just
+ * been a user-level checkpoint, wait until the tree changes.
+ *
+ * We used to switch chunks in the application thread if we got to
+ * here, but that is problematic because there is a transaction in
+ * progress and it could roll back, leaving the metadata inconsistent.
+ */
+ if (ovfl || !have_primary) {
+ for (waited = 0;
+ clsm->dsk_gen == lsm_tree->dsk_gen;
+ ++waited) {
+ if (waited % 100 == 0)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ __wt_sleep(0, 10);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_enter --
+ * Start an operation on an LSM cursor, update if the tree has changed.
+ */
+static inline int
+__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ uint64_t *switch_txnp;
+ uint64_t snap_min;
+
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ /* Merge cursors never update. */
+ if (F_ISSET(clsm, WT_CLSM_MERGE))
+ return (0);
+
+ if (reset) {
+ WT_ASSERT(session, !F_ISSET(&clsm->iface,
+ WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
+ WT_RET(__clsm_reset_cursors(clsm, NULL));
+ }
+
+ for (;;) {
+ /*
+ * If the cursor looks up-to-date, check if the cache is full.
+ * In case this call blocks, the check will be repeated before
+ * proceeding.
+ */
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ WT_RET(__wt_cache_full_check(session));
+
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ /* Update the maximum transaction ID in the primary chunk. */
+ if (update) {
+ WT_RET(__clsm_enter_update(clsm));
+ if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
+ goto open;
+
+ /*
+ * Ensure that there is a transaction snapshot active.
+ */
+ WT_RET(__wt_txn_autocommit_check(session));
+
+ if (session->txn.isolation == TXN_ISO_SNAPSHOT)
+ __wt_txn_cursor_op(session);
+
+ /*
+ * Figure out how many updates are required for
+ * snapshot isolation.
+ *
+ * This is not a normal visibility check on the maximum
+ * transaction ID in each chunk: any transaction ID
+ * that overlaps with our snapshot is a potential
+ * conflict.
+ */
+ clsm->nupdates = 1;
+ if (session->txn.isolation == TXN_ISO_SNAPSHOT &&
+ F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ WT_ASSERT(session,
+ F_ISSET(&session->txn, TXN_HAS_SNAPSHOT));
+ snap_min = session->txn.snap_min;
+ for (switch_txnp =
+ &clsm->switch_txn[clsm->nchunks - 2];
+ clsm->nupdates < clsm->nchunks;
+ clsm->nupdates++, switch_txnp--) {
+ if (TXNID_LT(*switch_txnp, snap_min))
+ break;
+ WT_ASSERT(session,
+ !__wt_txn_visible_all(
+ session, *switch_txnp));
+ }
+ }
+ }
+
+ /*
+ * Stop when we are up-to-date, as long as this is:
+ * - a snapshot isolation update and the cursor is set up for
+ * that;
+ * - an update operation with a primary chunk, or
+ * - a read operation and the cursor is open for reading.
+ */
+ if ((!update ||
+ session->txn.isolation != TXN_ISO_SNAPSHOT ||
+ F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
+ ((update && clsm->primary_chunk != NULL) ||
+ (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
+ break;
+
+open: WT_WITH_SCHEMA_LOCK(session,
+ ret = __clsm_open_cursors(clsm, update, 0, 0));
+ WT_RET(ret);
+ }
+
+ if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ WT_RET(__cursor_enter(session));
+ F_SET(clsm, WT_CLSM_ACTIVE);
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_leave --
+ * Finish an operation on an LSM cursor.
+ */
+static int
+__clsm_leave(WT_CURSOR_LSM *clsm)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ if (F_ISSET(clsm, WT_CLSM_ACTIVE)) {
+ WT_RET(__cursor_leave(session));
+ F_CLR(clsm, WT_CLSM_ACTIVE);
+ }
+
+ return (0);
+}
+
+/*
+ * We need a tombstone to mark deleted records, and we use the special
+ * value below for that purpose. We use two 0x14 (Device Control 4) bytes to
+ * minimize the likelihood of colliding with an application-chosen encoding
+ * byte.  If the application uses two leading DC4 bytes for some reason, we'll do
+ * a wasted data copy each time a new value is inserted into the object.
+ */
+static const WT_ITEM __tombstone = { "\x14\x14", 2, 0, NULL, 0 };
+
+/*
+ * __clsm_deleted --
+ * Check whether the current value is a tombstone.
+ */
+static inline int
+__clsm_deleted(WT_CURSOR_LSM *clsm, const WT_ITEM *item)
+{
+ return (!F_ISSET(clsm, WT_CLSM_MINOR_MERGE) &&
+ item->size == __tombstone.size &&
+ memcmp(item->data, __tombstone.data, __tombstone.size) == 0);
+}
+
+/*
+ * __clsm_deleted_encode --
+ * Encode values that collide with the tombstone's name space.
+ */
+static inline int
+__clsm_deleted_encode(WT_SESSION_IMPL *session,
+ const WT_ITEM *value, WT_ITEM *final_value, WT_ITEM **tmpp)
+{
+ WT_ITEM *tmp;
+
+ /*
+ * If value requires encoding, get a scratch buffer of the right size
+ * and create a copy of the data with the first byte of the tombstone
+ * appended.
+ */
+ if (value->size >= __tombstone.size &&
+ memcmp(value->data, __tombstone.data, __tombstone.size) == 0) {
+ WT_RET(__wt_scr_alloc(session, value->size + 1, tmpp));
+ tmp = *tmpp;
+
+ memcpy(tmp->mem, value->data, value->size);
+ memcpy((uint8_t *)tmp->mem + value->size, __tombstone.data, 1);
+ final_value->data = tmp->mem;
+ final_value->size = value->size + 1;
+ } else {
+ final_value->data = value->data;
+ final_value->size = value->size;
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_deleted_decode --
+ * Decode values that start with the tombstone.
+ */
+static inline void
+__clsm_deleted_decode(WT_ITEM *value)
+{
+ /*
+ * Take care with this check: when an LSM cursor is used for a merge,
+ * and/or to create a Bloom filter, it is valid to return the tombstone
+ * value.
+ */
+ if (value->size > __tombstone.size &&
+ memcmp(value->data, __tombstone.data, __tombstone.size) == 0)
+ --value->size;
+}
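+
+/*
+ * A worked example of the encoding (illustrative bytes only): storing
+ * the two-byte value "\x14\x14" would collide with the tombstone, so
+ * __clsm_deleted_encode appends another DC4 byte and writes the
+ * three-byte value "\x14\x14\x14". On read, __clsm_deleted_decode sees
+ * a value longer than the tombstone with a matching prefix and trims it
+ * back to "\x14\x14". A stored value of exactly "\x14\x14" is therefore
+ * unambiguous: it can only be a tombstone written by __clsm_remove.
+ */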
+
+/*
+ * __clsm_close_cursors --
+ * Close any btree cursors that are not needed.
+ */
+static int
+__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *c;
+ u_int i;
+
+ if (clsm->cursors == NULL || clsm->nchunks == 0)
+ return (0);
+
+ /*
+ * Walk the cursors, closing any we don't need. Note that the exit
+ * condition here is special: we don't use WT_FORALL_CURSORS, and we
+ * must be careful with unsigned integer wrapping.
+ */
+ for (i = start; i < end; i++) {
+ if ((c = (clsm)->cursors[i]) != NULL) {
+ clsm->cursors[i] = NULL;
+ WT_RET(c->close(c));
+ }
+ if ((bloom = clsm->blooms[i]) != NULL) {
+ clsm->blooms[i] = NULL;
+ WT_RET(__wt_bloom_close(bloom));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_open_cursors --
+ * Open cursors for the current set of files.
+ */
+static int
+__clsm_open_cursors(
+ WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *c, **cp, *primary;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+ const char *checkpoint, *ckpt_cfg[3];
+ uint64_t saved_gen;
+ u_int i, nchunks, ngood, nupdates;
+ int locked;
+
+ c = &clsm->iface;
+ session = (WT_SESSION_IMPL *)c->session;
+ txn = &session->txn;
+ lsm_tree = clsm->lsm_tree;
+ chunk = NULL;
+
+ ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
+ ckpt_cfg[2] = NULL;
+
+ /* Copy the key, so we don't lose the cursor position. */
+ if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
+ WT_RET(__wt_buf_set(
+ session, &c->key, c->key.data, c->key.size));
+
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ if (update) {
+ if (txn->isolation == TXN_ISO_SNAPSHOT)
+ F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
+ } else
+ F_SET(clsm, WT_CLSM_OPEN_READ);
+
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * If there is no in-memory chunk in the tree for an update operation,
+ * create one.
+ *
+ * !!!
+ * It is exceedingly unlikely that we get here at all, but if we were to
+ * switch chunks in this thread and our transaction rolled back, it would
+ * leave the metadata inconsistent. Signal for the LSM worker thread
+ * to create the chunk instead to avoid the issue.
+ */
+ if (update && (lsm_tree->nchunks == 0 ||
+ (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
+ chunk->switch_txn != WT_TXN_NONE)) {
+ /* Release our lock because switch will get a write lock. */
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /*
+ * Give the worker thread a chance to run before locking the
+ * tree again -- we will loop in __clsm_enter until there is an
+ * in-memory chunk in the tree.
+ */
+ __wt_sleep(0, 1000);
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ }
+
+ /* Merge cursors have already figured out how many chunks they need. */
+retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) {
+ nchunks = clsm->nchunks;
+ ngood = 0;
+
+ /*
+ * We may have raced with another merge completing. Check that
+ * we're starting at the right offset in the chunk array.
+ */
+ if (start_chunk >= lsm_tree->nchunks ||
+ lsm_tree->chunk[start_chunk]->id != start_id) {
+ for (start_chunk = 0;
+ start_chunk < lsm_tree->nchunks;
+ start_chunk++) {
+ chunk = lsm_tree->chunk[start_chunk];
+ if (chunk->id == start_id)
+ break;
+ }
+ /* We have to find the start chunk: merge locked it. */
+ WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
+ }
+
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ } else {
+ nchunks = lsm_tree->nchunks;
+
+ /*
+ * If we are only opening the cursor for updates, only open the
+ * primary chunk, plus any other chunks that might be required
+ * to detect snapshot isolation conflicts.
+ */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->txnid_alloc, nchunks,
+ &clsm->switch_txn));
+ if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
+ ngood = nupdates = 0;
+ else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
+ /*
+ * Keep going until all updates in the next
+ * chunk are globally visible. Copy the maximum
+ * transaction IDs into the cursor as we go.
+ */
+ for (ngood = nchunks - 1, nupdates = 1;
+ ngood > 0;
+ ngood--, nupdates++) {
+ chunk = lsm_tree->chunk[ngood - 1];
+ clsm->switch_txn[ngood - 1] = chunk->switch_txn;
+ if (__wt_txn_visible_all(
+ session, chunk->switch_txn))
+ break;
+ }
+ } else {
+ nupdates = 1;
+ ngood = nchunks - 1;
+ }
+
+ /* Check how many cursors are already open. */
+ for (cp = clsm->cursors + ngood;
+ ngood < clsm->nchunks && ngood < nchunks;
+ cp++, ngood++) {
+ chunk = lsm_tree->chunk[ngood];
+
+ /* If the cursor isn't open yet, we're done. */
+ if (*cp == NULL)
+ break;
+
+ /* Easy case: the URIs don't match. */
+ if (strcmp((*cp)->uri, chunk->uri) != 0)
+ break;
+
+ /* Make sure the checkpoint config matches. */
+ checkpoint = ((WT_CURSOR_BTREE *)*cp)->
+ btree->dhandle->checkpoint;
+ if (checkpoint == NULL &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !chunk->empty)
+ break;
+
+ /* Make sure the Bloom config matches. */
+ if (clsm->blooms[ngood] == NULL &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ break;
+ }
+
+ /* Spurious generation bump? */
+ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+ goto err;
+ }
+
+ /*
+ * Close any cursors we no longer need. If the cursor is a
+ * pure update cursor, close everything -- we usually only need
+ * a single chunk open in that case and we haven't walked all
+ * of the other slots in the loop above.
+ *
+ * Drop the LSM tree lock while we do this: if the cache is
+ * full, we may block while closing a cursor. Save the
+ * generation number and retry if it has changed under us.
+ */
+ if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
+ ngood = 0;
+ if (clsm->cursors != NULL && ngood < clsm->nchunks) {
+ saved_gen = lsm_tree->dsk_gen;
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
+ WT_ERR(__clsm_close_cursors(
+ clsm, ngood, clsm->nchunks));
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ if (lsm_tree->dsk_gen != saved_gen)
+ goto retry;
+ }
+
+ /* Detach from our old primary. */
+ clsm->primary_chunk = NULL;
+ clsm->current = NULL;
+ }
+
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->bloom_alloc, nchunks, &clsm->blooms));
+ WT_ERR(__wt_realloc_def(session,
+ &clsm->cursor_alloc, nchunks, &clsm->cursors));
+
+ clsm->nchunks = nchunks;
+
+ /* Open the cursors for chunks that have changed. */
+ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
+ chunk = lsm_tree->chunk[i + start_chunk];
+ /* Copy the maximum transaction ID. */
+ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
+ clsm->switch_txn[i] = chunk->switch_txn;
+
+ /*
+ * Read from the checkpoint if the file has been written.
+ * Once all cursors switch, the in-memory tree can be evicted.
+ */
+ WT_ASSERT(session, *cp == NULL);
+ ret = __wt_open_cursor(session, chunk->uri, c,
+ (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
+ ckpt_cfg : NULL, cp);
+
+ /*
+ * XXX kludge: we may have an empty chunk where no checkpoint
+ * was written. If so, try to open the ordinary handle on that
+ * chunk instead.
+ */
+ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ ret = __wt_open_cursor(
+ session, chunk->uri, c, NULL, cp);
+ if (ret == 0)
+ chunk->empty = 1;
+ }
+ WT_ERR(ret);
+
+ /*
+ * Set up all cursors other than the primary to do conflict
+ * checks only on insert operations. This allows us to execute
+ * inserts on non-primary chunks as a way of checking for
+ * write conflicts with concurrent updates.
+ */
+ if (i != nchunks - 1)
+ (*cp)->insert = __wt_curfile_update_check;
+
+ if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
+ lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count,
+ c, &clsm->blooms[i]));
+
+ /* Child cursors always use overwrite and raw mode. */
+ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
+ }
+
+ /* The last chunk is our new primary. */
+ if (chunk != NULL &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ chunk->switch_txn == WT_TXN_NONE) {
+ clsm->primary_chunk = chunk;
+ primary = clsm->cursors[clsm->nchunks - 1];
+ /*
+ * Disable eviction for the in-memory chunk. Also clear the
+ * bulk load flag here, otherwise eviction will be enabled by
+ * the first update.
+ */
+ btree = ((WT_CURSOR_BTREE *)(primary))->btree;
+ if (btree->bulk_load_ok) {
+ btree->bulk_load_ok = 0;
+ WT_WITH_BTREE(session, btree,
+ __wt_btree_evictable(session, 0));
+ }
+ }
+
+ clsm->dsk_gen = lsm_tree->dsk_gen;
+
+err:
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that all cursors are open as expected. */
+ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
+ for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
+ chunk = lsm_tree->chunk[i + start_chunk];
+
+ /* Make sure the cursor is open. */
+ WT_ASSERT(session, *cp != NULL);
+
+ /* Easy case: the URIs should match. */
+ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);
+
+ /* Make sure the checkpoint config matches. */
+ checkpoint = ((WT_CURSOR_BTREE *)*cp)->
+ btree->dhandle->checkpoint;
+ WT_ASSERT(session,
+ (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !chunk->empty) ?
+ checkpoint != NULL : checkpoint == NULL);
+
+ /* Make sure the Bloom config matches. */
+ WT_ASSERT(session,
+ (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
+ !F_ISSET(clsm, WT_CLSM_MERGE)) ?
+ clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
+ }
+ }
+#endif
+ if (locked)
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_clsm_init_merge --
+ * Initialize an LSM cursor for a merge.
+ */
+int
+__wt_clsm_init_merge(
+ WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ F_SET(clsm, WT_CLSM_MERGE);
+ if (start_chunk != 0)
+ F_SET(clsm, WT_CLSM_MINOR_MERGE);
+ clsm->nchunks = nchunks;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __clsm_open_cursors(clsm, 0, start_chunk, start_id));
+ return (ret);
+}
+
+/*
+ * __clsm_get_current --
+ * Find the smallest / largest of the cursors and copy its key/value.
+ */
+static int
+__clsm_get_current(
+ WT_SESSION_IMPL *session, WT_CURSOR_LSM *clsm, int smallest, int *deletedp)
+{
+ WT_CURSOR *c, *current;
+ int cmp, multiple;
+ u_int i;
+
+ current = NULL;
+ multiple = 0;
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (current == NULL) {
+ current = c;
+ continue;
+ }
+ WT_RET(WT_LSM_CURCMP(session, clsm->lsm_tree, c, current, cmp));
+ if (smallest ? cmp < 0 : cmp > 0) {
+ current = c;
+ multiple = 0;
+ } else if (cmp == 0)
+ multiple = 1;
+ }
+
+ c = &clsm->iface;
+ if ((clsm->current = current) == NULL) {
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ return (WT_NOTFOUND);
+ }
+
+ if (multiple)
+ F_SET(clsm, WT_CLSM_MULTIPLE);
+ else
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+
+ WT_RET(current->get_key(current, &c->key));
+ WT_RET(current->get_value(current, &c->value));
+
+ F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if ((*deletedp = __clsm_deleted(clsm, &c->value)) == 0)
+ F_SET(c, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+
+ return (0);
+}
+
+/*
+ * __clsm_compare --
+ * WT_CURSOR->compare implementation for the LSM cursor type.
+ */
+static int
+__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
+{
+ WT_CURSOR_LSM *alsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
+ alsm = (WT_CURSOR_LSM *)a;
+ CURSOR_API_CALL(a, session, compare, NULL);
+
+ /*
+ * Confirm both cursors refer to the same source and have keys, then
+ * compare the keys.
+ */
+ if (strcmp(a->uri, b->uri) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "comparison method cursors must reference the same object");
+
+ WT_CURSOR_NEEDKEY(a);
+ WT_CURSOR_NEEDKEY(b);
+
+ WT_ERR(__wt_compare(
+ session, alsm->lsm_tree->collator, &a->key, &b->key, cmpp));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __clsm_next --
+ * WT_CURSOR->next method for the LSM cursor type.
+ */
+static int
+__clsm_next(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int check, cmp, deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, next, NULL);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 0));
+
+ /* If we aren't positioned for a forward scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->next(c);
+ } else if (c != clsm->current) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp < 0)
+ ret = c->next(c);
+ else if (cmp == 0) {
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ } else
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_NEXT);
+ F_CLR(clsm, WT_CLSM_ITERATE_PREV);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * forward.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ check = 0;
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the smallest cursor forward. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ }
+
+ /* Find the cursor(s) with the smallest key. */
+ if ((ret = __clsm_get_current(session, clsm, 1, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_prev --
+ * WT_CURSOR->prev method for the LSM cursor type.
+ */
+static int
+__clsm_prev(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int check, cmp, deleted;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, prev, NULL);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 0));
+
+ /* If we aren't positioned for a reverse scan, get started. */
+ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_PREV)) {
+ F_CLR(clsm, WT_CLSM_MULTIPLE);
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
+ WT_ERR(c->reset(c));
+ ret = c->prev(c);
+ } else if (c != clsm->current) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == 0) {
+ if (cmp > 0)
+ ret = c->prev(c);
+ else if (cmp == 0) {
+ if (clsm->current == NULL)
+ clsm->current = c;
+ else
+ F_SET(clsm,
+ WT_CLSM_MULTIPLE);
+ }
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ F_SET(clsm, WT_CLSM_ITERATE_PREV);
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT);
+
+ /* We just positioned *at* the key, now move. */
+ if (clsm->current != NULL)
+ goto retry;
+ } else {
+retry: /*
+ * If there are multiple cursors on that key, move them
+ * backwards.
+ */
+ if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) {
+ check = 0;
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (!F_ISSET(c, WT_CURSTD_KEY_INT))
+ continue;
+ if (check) {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, clsm->current,
+ cmp));
+ if (cmp == 0)
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+ if (c == clsm->current)
+ check = 1;
+ }
+ }
+
+ /* Move the largest cursor backwards. */
+ c = clsm->current;
+ WT_ERR_NOTFOUND_OK(c->prev(c));
+ }
+
+ /* Find the cursor(s) with the largest key. */
+ if ((ret = __clsm_get_current(session, clsm, 0, &deleted)) == 0 &&
+ deleted)
+ goto retry;
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_reset_cursors --
+ * Reset any positioned chunk cursors.
+ *
+ * If the skip parameter is non-NULL, that cursor is about to be used, so
+ * there is no need to reset it.
+ */
+static int
+__clsm_reset_cursors(WT_CURSOR_LSM *clsm, WT_CURSOR *skip)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ u_int i;
+
+ /* Fast path if the cursor is not positioned. */
+ if ((clsm->current == NULL || clsm->current == skip) &&
+ !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV))
+ return (0);
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ if (c == skip)
+ continue;
+ if (F_ISSET(c, WT_CURSTD_KEY_INT))
+ WT_TRET(c->reset(c));
+ }
+
+ clsm->current = NULL;
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ return (ret);
+}
+
+/*
+ * __clsm_reset --
+ * WT_CURSOR->reset method for the LSM cursor type.
+ */
+static int
+__clsm_reset(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when all
+ * we want to do is give up our position.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL(cursor, session, reset, NULL);
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+
+ WT_TRET(__clsm_reset_cursors(clsm, NULL));
+
+ /* In case we were left positioned, clear that. */
+ WT_TRET(__clsm_leave(clsm));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __clsm_lookup --
+ * Position an LSM cursor.
+ */
+static int
+__clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
+{
+ WT_BLOOM *bloom;
+ WT_BLOOM_HASH bhash;
+ WT_CURSOR *c, *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int have_hash;
+
+ c = NULL;
+ cursor = &clsm->iface;
+ have_hash = 0;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_FORALL_CURSORS(clsm, c, i) {
+ /* If there is a Bloom filter, see if we can skip the read. */
+ bloom = NULL;
+ if ((bloom = clsm->blooms[i]) != NULL) {
+ if (!have_hash) {
+ WT_ERR(__wt_bloom_hash(
+ bloom, &cursor->key, &bhash));
+ have_hash = 1;
+ }
+
+ ret = __wt_bloom_hash_get(bloom, &bhash);
+ if (ret == WT_NOTFOUND) {
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_miss);
+ continue;
+ } else if (ret == 0)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_hit);
+ WT_ERR(ret);
+ }
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search(c)) == 0) {
+ WT_ERR(c->get_key(c, &cursor->key));
+ WT_ERR(c->get_value(c, value));
+ if (__clsm_deleted(clsm, value))
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ /* Update stats: the active chunk can't have a bloom filter. */
+ if (bloom != NULL)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, bloom_false_positive);
+ else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
+ WT_STAT_FAST_INCR(session,
+ &clsm->lsm_tree->stats, lsm_lookup_no_bloom);
+ }
+ WT_ERR(WT_NOTFOUND);
+
+done:
+err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (ret == 0) {
+ clsm->current = c;
+ F_SET(cursor, WT_CURSTD_KEY_INT);
+ if (value == &cursor->value)
+ F_SET(cursor, WT_CURSTD_VALUE_INT);
+ } else if (c != NULL)
+ WT_TRET(c->reset(c));
+
+ return (ret);
+}
+
+/*
+ * __clsm_search --
+ * WT_CURSOR->search method for the LSM cursor type.
+ */
+static int
+__clsm_search(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 1, 0));
+
+ ret = __clsm_lookup(clsm, &cursor->value);
+
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0)
+ __clsm_deleted_decode(&cursor->value);
+ return (ret);
+}
+
+/*
+ * __clsm_search_near --
+ * WT_CURSOR->search_near method for the LSM cursor type.
+ */
+static int
+__clsm_search_near(WT_CURSOR *cursor, int *exactp)
+{
+ WT_CURSOR *c, *larger, *smaller;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM v;
+ WT_SESSION_IMPL *session;
+ u_int i;
+ int cmp, deleted;
+
+ larger = smaller = NULL;
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_API_CALL(cursor, session, search_near, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 1, 0));
+ F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);
+
+ /*
+ * search_near is somewhat fiddly: we can't just use a nearby key from
+ * the in-memory chunk because there could be a closer key on disk.
+ *
+ * As we search down the chunks, we stop as soon as we find an exact
+ * match. Otherwise, we maintain the smallest cursor larger than the
+ * search key and the largest cursor smaller than the search key. At
+ * the bottom, we prefer the larger cursor, but if no record is larger,
+ * use the smaller cursor, or if no record at all was found,
+ * WT_NOTFOUND.
+ */
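+ /*
+ * For example (hypothetical chunk contents): with one chunk
+ * holding keys { 10, 30 } and another { 20, 40 }, suppose a
+ * search_near(25) positions the chunk cursors on 30 and 20.
+ * The cursor on 20 is stepped forward to 40 because we prefer
+ * larger keys, 30 then wins the smallest-larger comparison,
+ * and the call returns key 30 with *exactp == 1.
+ */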
+ WT_FORALL_CURSORS(clsm, c, i) {
+ c->set_key(c, &cursor->key);
+ if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
+ F_CLR(c, WT_CURSTD_KEY_SET);
+ ret = 0;
+ continue;
+ } else if (ret != 0)
+ goto err;
+
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+
+ if (cmp == 0 && !deleted) {
+ clsm->current = c;
+ *exactp = 0;
+ goto done;
+ }
+
+ /*
+ * Prefer larger cursors. There are two reasons: (1) we expect
+ * prefix searches to be a common case (as in our own indices);
+ * and (2) we need a way to unambiguously know we have the
+ * "closest" result.
+ */
+ if (cmp < 0) {
+ if ((ret = c->next(c)) == 0)
+ cmp = 1;
+ else if (ret == WT_NOTFOUND)
+ ret = c->prev(c);
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * If we land on a deleted item, try going forwards or
+ * backwards to find one that isn't deleted.
+ */
+ while (deleted && (ret = c->next(c)) == 0) {
+ cmp = 1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ while (deleted && (ret = c->prev(c)) == 0) {
+ cmp = -1;
+ WT_ERR(c->get_value(c, &v));
+ deleted = __clsm_deleted(clsm, &v);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if (deleted)
+ continue;
+
+ /*
+ * We are trying to find the smallest cursor greater than the
+ * search key, or, if there is no larger key, the largest
+ * cursor smaller than the search key.
+ *
+ * It could happen that one cursor contains both of the closest
+ * records. In that case, we will track it in "larger", and it
+ * will be the one we finally choose.
+ */
+ if (cmp > 0) {
+ if (larger == NULL)
+ larger = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, larger, cmp));
+ if (cmp < 0) {
+ WT_ERR(larger->reset(larger));
+ larger = c;
+ }
+ }
+ } else {
+ if (smaller == NULL)
+ smaller = c;
+ else {
+ WT_ERR(WT_LSM_CURCMP(session,
+ clsm->lsm_tree, c, smaller, cmp));
+ if (cmp > 0) {
+ WT_ERR(smaller->reset(smaller));
+ smaller = c;
+ }
+ }
+ }
+
+ if (c != smaller && c != larger)
+ WT_ERR(c->reset(c));
+ }
+
+ if (larger != NULL) {
+ clsm->current = larger;
+ larger = NULL;
+ *exactp = 1;
+ } else if (smaller != NULL) {
+ clsm->current = smaller;
+ smaller = NULL;
+ *exactp = -1;
+ } else
+ ret = WT_NOTFOUND;
+
+done:
+err: WT_TRET(__clsm_leave(clsm));
+ API_END(session, ret);
+ if (ret == 0) {
+ c = clsm->current;
+ WT_TRET(c->get_key(c, &cursor->key));
+ WT_TRET(c->get_value(c, &cursor->value));
+ }
+ if (smaller != NULL)
+ WT_TRET(smaller->reset(smaller));
+ if (larger != NULL)
+ WT_TRET(larger->reset(larger));
+
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ if (ret == 0) {
+ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
+ __clsm_deleted_decode(&cursor->value);
+ } else
+ clsm->current = NULL;
+
+ return (ret);
+}
+
+/*
+ * __clsm_put --
+ * Put an entry into the in-memory tree, trigger a file switch if
+ * necessary.
+ */
+static inline int
+__clsm_put(WT_SESSION_IMPL *session,
+ WT_CURSOR_LSM *clsm, const WT_ITEM *key, const WT_ITEM *value, int position)
+{
+ WT_CURSOR *c, *primary;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+
+ lsm_tree = clsm->lsm_tree;
+
+ WT_ASSERT(session,
+ clsm->primary_chunk != NULL &&
+ (clsm->primary_chunk->switch_txn == WT_TXN_NONE ||
+ TXNID_LE(session->txn.id, clsm->primary_chunk->switch_txn)));
+
+ /*
+ * Clear the existing cursor position. Don't clear the primary cursor:
+ * we're about to use it anyway.
+ */
+ primary = clsm->cursors[clsm->nchunks - 1];
+ WT_RET(__clsm_reset_cursors(clsm, primary));
+
+ /* If necessary, set the position for future scans. */
+ if (position)
+ clsm->current = primary;
+
+ for (i = 0; i < clsm->nupdates; i++) {
+ c = clsm->cursors[(clsm->nchunks - i) - 1];
+ c->set_key(c, key);
+ c->set_value(c, value);
+ WT_RET((position && i == 0) ? c->update(c) : c->insert(c));
+ }
+
+ /*
+ * Update the record count. It is in a shared structure, but it's only
+ * approximate, so don't worry about protecting access.
+ *
+ * Throttle if necessary. Every 100 update operations on each cursor,
+ * check if throttling is required. Don't rely only on the shared
+ * counter because it can race, and because for some workloads, there
+ * may not be enough records per chunk to get effective throttling.
+ */
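+ /*
+ * For example (illustrative values): with ckpt_throttle == 1200
+ * and merge_throttle == 300, both in microseconds, the check
+ * below ends in __wt_sleep(0, 1500), stalling this cursor for
+ * roughly 1.5ms.
+ */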
+ if ((++clsm->primary_chunk->count % 100 == 0 ||
+ ++clsm->update_count >= 100) &&
+ lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
+ clsm->update_count = 0;
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+ __wt_sleep(0,
+ lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
+ }
+
+ return (0);
+}
+
+/*
+ * __clsm_insert --
+ * WT_CURSOR->insert method for the LSM cursor type.
+ */
+static int
+__clsm_insert(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
+ (ret = __clsm_lookup(clsm, &value)) != WT_NOTFOUND) {
+ if (ret == 0)
+ ret = WT_DUPLICATE_KEY;
+ goto err;
+ }
+
+ WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf));
+ ret = __clsm_put(session, clsm, &cursor->key, &value, 0);
+
+err: __wt_scr_free(&buf);
+ WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_update --
+ * WT_CURSOR->update method for the LSM cursor type.
+ */
+static int
+__clsm_update(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, update, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NEEDVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_lookup(clsm, &value)) == 0) {
+ WT_ERR(__clsm_deleted_encode(
+ session, &cursor->value, &value, &buf));
+ ret = __clsm_put(session, clsm, &cursor->key, &value, 1);
+ }
+
+err: __wt_scr_free(&buf);
+ WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_remove --
+ * WT_CURSOR->remove method for the LSM cursor type.
+ */
+static int
+__clsm_remove(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_ITEM value;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+
+ CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);
+ WT_CURSOR_NEEDKEY(cursor);
+ WT_CURSOR_NOVALUE(cursor);
+ WT_ERR(__clsm_enter(clsm, 0, 1));
+
+ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) ||
+ (ret = __clsm_lookup(clsm, &value)) == 0)
+ ret = __clsm_put(session, clsm, &cursor->key, &__tombstone, 1);
+
+err: WT_TRET(__clsm_leave(clsm));
+ CURSOR_UPDATE_API_END(session, ret);
+ return (ret);
+}
+
+/*
+ * __clsm_close --
+ * WT_CURSOR->close method for the LSM cursor type.
+ */
+static int
+__clsm_close(WT_CURSOR *cursor)
+{
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ /*
+ * Don't use the normal __clsm_enter path: that is wasted work when
+ * closing, and the cursor may never have been used.
+ */
+ clsm = (WT_CURSOR_LSM *)cursor;
+ CURSOR_API_CALL(cursor, session, close, NULL);
+ WT_TRET(__clsm_close_cursors(clsm, 0, clsm->nchunks));
+ __wt_free(session, clsm->blooms);
+ __wt_free(session, clsm->cursors);
+ __wt_free(session, clsm->switch_txn);
+
+ /* In case we were somehow left positioned, clear that. */
+ WT_TRET(__clsm_leave(clsm));
+
+ /* The WT_LSM_TREE owns the URI. */
+ cursor->uri = NULL;
+ if (clsm->lsm_tree != NULL)
+ __wt_lsm_tree_release(session, clsm->lsm_tree);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __wt_clsm_open --
+ * WT_SESSION->open_cursor method for LSM cursors.
+ */
+int
+__wt_clsm_open(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __clsm_compare, /* compare */
+ __clsm_next, /* next */
+ __clsm_prev, /* prev */
+ __clsm_reset, /* reset */
+ __clsm_search, /* search */
+ __clsm_search_near, /* search-near */
+ __clsm_insert, /* insert */
+ __clsm_update, /* update */
+ __clsm_remove, /* remove */
+ __clsm_close); /* close */
+ WT_CURSOR *cursor;
+ WT_CURSOR_LSM *clsm;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ clsm = NULL;
+ cursor = NULL;
+
+ if (!WT_PREFIX_MATCH(uri, "lsm:"))
+ return (EINVAL);
+
+ WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0)
+ WT_RET_MSG(session, EINVAL,
+ "LSM does not support opening by checkpoint");
+
+ /* Get the LSM tree. */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ WT_RET(ret);
+
+ WT_ERR(__wt_calloc_def(session, 1, &clsm));
+
+ cursor = &clsm->iface;
+ *cursor = iface;
+ cursor->session = &session->iface;
+ cursor->uri = lsm_tree->name;
+ cursor->key_format = lsm_tree->key_format;
+ cursor->value_format = lsm_tree->value_format;
+
+ WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 0));
+
+ clsm->lsm_tree = lsm_tree;
+
+ /*
+ * The tree's dsk_gen starts at one, so starting the cursor on zero
+ * will force a call into open_cursors on the first operation.
+ */
+ clsm->dsk_gen = 0;
+
+ WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
+ WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
+
+ if (0) {
+err: __wt_lsm_tree_release(session, lsm_tree);
+ if (clsm != NULL) {
+ clsm->lsm_tree = NULL;
+ WT_TRET(__clsm_close(cursor));
+ }
+ }
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_manager.c b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
new file mode 100644
index 00000000000..8f4b3ba49ef
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_manager.c
@@ -0,0 +1,667 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_manager_aggressive_update(WT_SESSION_IMPL *, WT_LSM_TREE *);
+static int __lsm_manager_run_server(WT_SESSION_IMPL *);
+static int __lsm_manager_worker_setup(WT_SESSION_IMPL *);
+
+static void * __lsm_worker_manager(void *);
+
+/*
+ * __wt_lsm_manager_config --
+ * Configure the LSM manager.
+ */
+int
+__wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONFIG_ITEM cval;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "lsm_manager.merge", &cval));
+ if (cval.val)
+ F_SET(conn, WT_CONN_LSM_MERGE);
+ WT_RET(__wt_config_gets(
+ session, cfg, "lsm_manager.worker_thread_max", &cval));
+ if (cval.val)
+ conn->lsm_manager.lsm_workers_max = (uint32_t)cval.val;
+ return (0);
+}
+
+/*
+ * __lsm_general_worker_start --
+ * Start up all of the general LSM worker threads.
+ */
+static int
+__lsm_general_worker_start(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORKER_ARGS *worker_args;
+
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
+
+ /*
+ * Start the remaining worker threads.
+ * This should get more sophisticated in the future - only launching
+ * as many worker threads as are required to keep up with demand.
+ */
+ WT_ASSERT(session, manager->lsm_workers > 1);
+ for (; manager->lsm_workers < manager->lsm_workers_max;
+ manager->lsm_workers++) {
+ worker_args =
+ &manager->lsm_worker_cookies[manager->lsm_workers];
+ worker_args->work_cond = manager->work_cond;
+ worker_args->id = manager->lsm_workers;
+ worker_args->type =
+ WT_LSM_WORK_BLOOM |
+ WT_LSM_WORK_DROP |
+ WT_LSM_WORK_FLUSH |
+ WT_LSM_WORK_SWITCH;
+ F_SET(worker_args, WT_LSM_WORKER_RUN);
+ /*
+ * Only allow half of the threads to run merges, to avoid all
+ * workers getting stuck in long-running merge operations.
+ * Make sure the first worker is allowed, so that there is at
+ * least one thread capable of running merges. We know the
+ * first worker is id 2, so set merges on even numbered workers.
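+ * (For example, with lsm_workers_max == 5, this loop starts
+ * workers 2, 3 and 4, and workers 2 and 4 also run merges.)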
+ */
+ if (manager->lsm_workers % 2 == 0)
+ FLD_SET(worker_args->type, WT_LSM_WORK_MERGE);
+ WT_RET(__wt_lsm_worker_start(session, worker_args));
+ }
+ return (0);
+}
+
+/*
+ * __lsm_stop_workers --
+ * Stop worker threads until the number reaches the configured amount.
+ */
+static int
+__lsm_stop_workers(WT_SESSION_IMPL *session)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORKER_ARGS *worker_args;
+ uint32_t i;
+
+ manager = &S2C(session)->lsm_manager;
+ /*
+ * Start at the end of the list of threads and stop them until we
+ * have the desired number. We want to keep all active threads
+ * packed at the front of the worker array.
+ */
+ WT_ASSERT(session, manager->lsm_workers != 0);
+ for (i = manager->lsm_workers - 1; i >= manager->lsm_workers_max; i--) {
+ worker_args = &manager->lsm_worker_cookies[i];
+ /*
+ * Clear this worker's flag so it stops.
+ */
+ F_CLR(worker_args, WT_LSM_WORKER_RUN);
+ WT_ASSERT(session, worker_args->tid != 0);
+ WT_RET(__wt_thread_join(session, worker_args->tid));
+ worker_args->tid = 0;
+ worker_args->type = 0;
+ worker_args->flags = 0;
+ manager->lsm_workers--;
+ /*
+ * We do not clear the sessions: they are allocated once
+ * when the connection is opened.
+ */
+ }
+ return (0);
+}
+
+/*
+ * __wt_lsm_manager_reconfig --
+ * Re-configure the LSM manager.
+ */
+int
+__wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg)
+{
+ WT_LSM_MANAGER *manager;
+ uint32_t orig_workers;
+
+ manager = &S2C(session)->lsm_manager;
+ orig_workers = manager->lsm_workers_max;
+
+ WT_RET(__wt_lsm_manager_config(session, cfg));
+ /*
+ * If LSM hasn't started yet, we've simply updated the settings
+ * and the normal code path will start the threads when it does.
+ */
+ if (manager->lsm_workers_max == 0)
+ return (0);
+ if (manager->lsm_workers == 0)
+ return (0);
+ /*
+ * If the number of workers has not changed, we're done.
+ */
+ if (orig_workers == manager->lsm_workers_max)
+ return (0);
+ /*
+ * If we want more threads, start them.
+ */
+ if (manager->lsm_workers_max > orig_workers)
+ return (__lsm_general_worker_start(session));
+
+ /*
+ * Otherwise we want to reduce the number of workers.
+ */
+ WT_ASSERT(session, manager->lsm_workers_max < orig_workers);
+ WT_RET(__lsm_stop_workers(session));
+ return (0);
+}
+
+/*
+ * __wt_lsm_manager_start --
+ * Start the LSM management infrastructure. Our queues and locks were
+ * initialized when the connection was initialized.
+ */
+int
+__wt_lsm_manager_start(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_MANAGER *manager;
+ WT_SESSION_IMPL *worker_session;
+ uint32_t i;
+
+ manager = &S2C(session)->lsm_manager;
+
+ /*
+ * We need at least a manager, a switch thread and a generic
+ * worker.
+ */
+ WT_ASSERT(session, manager->lsm_workers_max > 2);
+
+ /*
+ * Open sessions for all potential worker threads here - it's not
+ * safe to have worker threads open/close sessions themselves.
+ * All the LSM worker threads do their operations on read-only
+ * files. Use read-uncommitted isolation to avoid keeping
+ * updates in cache unnecessarily.
+ */
+ for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
+ WT_ERR(__wt_open_internal_session(
+ S2C(session), "lsm-worker", 1, 0, &worker_session));
+ worker_session->isolation = TXN_ISO_READ_UNCOMMITTED;
+ manager->lsm_worker_cookies[i].session = worker_session;
+ }
+
+ /* Start the LSM manager thread. */
+ WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid,
+ __lsm_worker_manager, &manager->lsm_worker_cookies[0]));
+
+ F_SET(S2C(session), WT_CONN_SERVER_LSM);
+
+ if (0) {
+err: for (i = 0;
+ (worker_session =
+ manager->lsm_worker_cookies[i].session) != NULL;
+ i++)
+ WT_TRET((&worker_session->iface)->close(
+ &worker_session->iface, NULL));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_manager_free_work_unit --
+ * Release an LSM tree work unit.
+ */
+void
+__wt_lsm_manager_free_work_unit(
+ WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry)
+{
+ if (entry != NULL) {
+ WT_ASSERT(session, entry->lsm_tree->queue_ref > 0);
+
+ (void)WT_ATOMIC_SUB4(entry->lsm_tree->queue_ref, 1);
+ __wt_free(session, entry);
+ }
+}
+
+/*
+ * __wt_lsm_manager_destroy --
+ * Destroy the LSM manager threads and subsystem.
+ */
+int
+__wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *current, *next;
+ WT_SESSION *wt_session;
+ uint32_t i;
+ uint64_t removed;
+
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
+ removed = 0;
+
+ if (manager->lsm_workers > 0) {
+ /*
+ * Stop the main LSM manager thread first.
+ */
+ while (F_ISSET(conn, WT_CONN_SERVER_LSM))
+ __wt_yield();
+
+ /* Clean up open LSM handles. */
+ ret = __wt_lsm_tree_close_all(session);
+
+ WT_TRET(__wt_thread_join(
+ session, manager->lsm_worker_cookies[0].tid));
+ manager->lsm_worker_cookies[0].tid = 0;
+
+ /* Release memory from any operations left on the queue. */
+ for (current = TAILQ_FIRST(&manager->switchqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->switchqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ for (current = TAILQ_FIRST(&manager->appqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->appqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ for (current = TAILQ_FIRST(&manager->managerqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ TAILQ_REMOVE(&manager->managerqh, current, q);
+ ++removed;
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+
+ /* Close all LSM worker sessions. */
+ for (i = 0; i < WT_LSM_MAX_WORKERS; i++) {
+ wt_session =
+ &manager->lsm_worker_cookies[i].session->iface;
+ WT_TRET(wt_session->close(wt_session, NULL));
+ }
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_work_units_discarded, removed);
+
+ /* Free resources that were allocated during connection initialization. */
+ __wt_spin_destroy(session, &manager->switch_lock);
+ __wt_spin_destroy(session, &manager->app_lock);
+ __wt_spin_destroy(session, &manager->manager_lock);
+ WT_TRET(__wt_cond_destroy(session, &manager->work_cond));
+
+ return (ret);
+}
+
+/*
+ * __lsm_manager_aggressive_update --
+ * Update the merge aggressiveness for a single LSM tree.
+ */
+static int
+__lsm_manager_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ struct timespec now;
+ uint64_t chunk_wait, stallms;
+ u_int new_aggressive;
+
+ WT_RET(__wt_epoch(session, &now));
+ stallms = WT_TIMEDIFF(now, lsm_tree->last_flush_ts) / WT_MILLION;
+ /*
+ * Get aggressive if more than enough chunks for a merge should have
+ * been created by now. Use 10 seconds as a default if we don't have an
+ * estimate.
+ */
+ if (lsm_tree->nchunks > 1)
+ chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ?
+ 10000 : lsm_tree->chunk_fill_ms);
+ else
+ chunk_wait = 0;
+ new_aggressive = (u_int)(chunk_wait / lsm_tree->merge_min);
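+
+ /*
+ * For example (illustrative timings): if flushes have stalled for
+ * 60 seconds (stallms == 60000) and chunks normally fill in two
+ * seconds (chunk_fill_ms == 2000), chunk_wait is 30; with
+ * merge_min == 4 that yields new_aggressive == 7.
+ */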
+
+ if (new_aggressive > lsm_tree->merge_aggressiveness) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM merge %s got aggressive (old %u new %u), "
+ "merge_min %d, %u / %" PRIu64,
+ lsm_tree->name, lsm_tree->merge_aggressiveness,
+ new_aggressive, lsm_tree->merge_min, stallms,
+ lsm_tree->chunk_fill_ms));
+ lsm_tree->merge_aggressiveness = new_aggressive;
+ }
+ return (0);
+}
+
+/*
+ * __lsm_manager_worker_setup --
+ * Do setup owned by the LSM manager thread including starting the worker
+ * threads.
+ */
+static int
+__lsm_manager_worker_setup(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORKER_ARGS *worker_args;
+
+ conn = S2C(session);
+ manager = &conn->lsm_manager;
+
+ WT_ASSERT(session, manager->lsm_workers == 1);
+ /*
+ * The LSM manager is worker[0]. The switch thread is worker[1].
+ * Setup and start the switch/drop worker explicitly.
+ */
+ worker_args = &manager->lsm_worker_cookies[1];
+ worker_args->work_cond = manager->work_cond;
+ worker_args->id = manager->lsm_workers++;
+ worker_args->type = WT_LSM_WORK_DROP | WT_LSM_WORK_SWITCH;
+ F_SET(worker_args, WT_LSM_WORKER_RUN);
+ /* Start the switch thread. */
+ WT_RET(__wt_lsm_worker_start(session, worker_args));
+ WT_RET(__lsm_general_worker_start(session));
+
+ return (0);
+}
+
+/*
+ * __lsm_manager_worker_shutdown --
+ * Shut down the LSM manager and worker threads.
+ */
+static int
+__lsm_manager_worker_shutdown(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_MANAGER *manager;
+ u_int i;
+
+ manager = &S2C(session)->lsm_manager;
+
+ /*
+ * Wait for the rest of the LSM workers to shut down. Stop at index
+ * one, since we (the manager) are at index 0.
+ */
+ for (i = 1; i < manager->lsm_workers; i++) {
+ WT_ASSERT(session, manager->lsm_worker_cookies[i].tid != 0);
+ WT_TRET(__wt_cond_signal(session, manager->work_cond));
+ WT_TRET(__wt_thread_join(
+ session, manager->lsm_worker_cookies[i].tid));
+ }
+ return (ret);
+}
+
+/*
+ * __lsm_manager_run_server --
+ * Run manager thread operations.
+ */
+static int
+__lsm_manager_run_server(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LSM_TREE *lsm_tree;
+ struct timespec now;
+ uint64_t fillms, pushms;
+
+ conn = S2C(session);
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN)) {
+ if (TAILQ_EMPTY(&conn->lsmqh)) {
+ __wt_sleep(0, 10000);
+ continue;
+ }
+ __wt_sleep(0, 10000);
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) {
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ continue;
+ WT_RET(__lsm_manager_aggressive_update(
+ session, lsm_tree));
+ WT_RET(__wt_epoch(session, &now));
+ pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 :
+ WT_TIMEDIFF(
+ now, lsm_tree->work_push_ts) / WT_MILLION;
+ fillms = 3 * lsm_tree->chunk_fill_ms;
+ if (fillms == 0)
+ fillms = 10000;
+ /*
+ * If the tree appears not to be triggering enough LSM
+ * maintenance, help it out. Additional work units don't
+ * hurt, and can be necessary if some work units aren't
+ * completed for some reason.
+ * If the tree hasn't been modified and there is more than
+ * one chunk, try to shrink the tree so queries run faster.
+ * If we are getting aggressive, ensure there are enough
+ * work units that chunks can get merged.
+ * If we aren't pushing enough work units compared to how
+ * often new chunks are being created, add some more.
+ */
+ if (lsm_tree->queue_ref >= LSM_TREE_MAX_QUEUE)
+ WT_STAT_FAST_CONN_INCR(session,
+ lsm_work_queue_max);
+ else if ((!lsm_tree->modified &&
+ lsm_tree->nchunks > 1) ||
+ (lsm_tree->queue_ref == 0 &&
+ lsm_tree->nchunks > 1) ||
+ (lsm_tree->merge_aggressiveness > 3 &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) ||
+ pushms > fillms) {
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_DROP, 0, lsm_tree));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "MGR %s: queue %d mod %d nchunks %d"
+ " flags 0x%x aggressive %d pushms %" PRIu64
+ " fillms %" PRIu64,
+ lsm_tree->name, lsm_tree->queue_ref,
+ lsm_tree->modified, lsm_tree->nchunks,
+ lsm_tree->flags,
+ lsm_tree->merge_aggressiveness,
+ pushms, fillms));
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __lsm_worker_manager --
+ * A thread that manages all open LSM trees, and the shared LSM worker
+ * threads.
+ */
+static void *
+__lsm_worker_manager(void *arg)
+{
+ WT_DECL_RET;
+ WT_LSM_WORKER_ARGS *cookie;
+ WT_SESSION_IMPL *session;
+
+ cookie = (WT_LSM_WORKER_ARGS *)arg;
+ session = cookie->session;
+
+ WT_ERR(__lsm_manager_worker_setup(session));
+ WT_ERR(__lsm_manager_run_server(session));
+ WT_ERR(__lsm_manager_worker_shutdown(session));
+
+ if (ret != 0) {
+err: __wt_err(session, ret, "LSM worker manager thread error");
+ }
+ F_CLR(S2C(session), WT_CONN_SERVER_LSM);
+ return (NULL);
+}
+
+/*
+ * __wt_lsm_manager_clear_tree --
+ * Remove all entries for a tree from the LSM manager queues. This
+ * introduces an inefficiency if LSM trees are being opened and closed
+ * regularly.
+ */
+int
+__wt_lsm_manager_clear_tree(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *current, *next;
+ uint64_t removed;
+
+ manager = &S2C(session)->lsm_manager;
+ removed = 0;
+
+ /* Clear out the tree from the switch queue */
+ __wt_spin_lock(session, &manager->switch_lock);
+
+ /* Structure the loop so that it's safe to free as we iterate */
+ for (current = TAILQ_FIRST(&manager->switchqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->switchqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->switch_lock);
+ /* Clear out the tree from the application queue */
+ __wt_spin_lock(session, &manager->app_lock);
+ for (current = TAILQ_FIRST(&manager->appqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->appqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->app_lock);
+ /* Clear out the tree from the manager queue */
+ __wt_spin_lock(session, &manager->manager_lock);
+ for (current = TAILQ_FIRST(&manager->managerqh);
+ current != NULL; current = next) {
+ next = TAILQ_NEXT(current, q);
+ if (current->lsm_tree != lsm_tree)
+ continue;
+ ++removed;
+ TAILQ_REMOVE(&manager->managerqh, current, q);
+ __wt_lsm_manager_free_work_unit(session, current);
+ }
+ __wt_spin_unlock(session, &manager->manager_lock);
+ WT_STAT_FAST_CONN_INCRV(session, lsm_work_units_discarded, removed);
+ return (0);
+}
+
+/*
+ * We assume this is only called from __wt_lsm_manager_pop_entry, where
+ * session, entry and type are available to use. If the queue is empty,
+ * the macro returns zero from the calling function.
+ */
+#define LSM_POP_ENTRY(qh, qlock, qlen) do { \
+ if (TAILQ_EMPTY(qh)) \
+ return (0); \
+ __wt_spin_lock(session, qlock); \
+ TAILQ_FOREACH(entry, (qh), q) { \
+ if (FLD_ISSET(type, entry->type)) { \
+ TAILQ_REMOVE(qh, entry, q); \
+ WT_STAT_FAST_CONN_DECR(session, qlen); \
+ break; \
+ } \
+ } \
+ __wt_spin_unlock(session, (qlock)); \
+} while (0)
+
+/*
+ * __wt_lsm_manager_pop_entry --
+ * Retrieve the head of the queue, if it matches the requested work
+ * unit type.
+ */
+int
+__wt_lsm_manager_pop_entry(
+ WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *entry;
+
+ manager = &S2C(session)->lsm_manager;
+ *entryp = NULL;
+ entry = NULL;
+
+ /*
+ * Pop the entry off the correct queue based on our work type.
+ */
+ if (type == WT_LSM_WORK_SWITCH)
+ LSM_POP_ENTRY(&manager->switchqh,
+ &manager->switch_lock, lsm_work_queue_switch);
+ else if (type == WT_LSM_WORK_MERGE)
+ LSM_POP_ENTRY(&manager->managerqh,
+ &manager->manager_lock, lsm_work_queue_manager);
+ else
+ LSM_POP_ENTRY(&manager->appqh,
+ &manager->app_lock, lsm_work_queue_app);
+ if (entry != NULL)
+ WT_STAT_FAST_CONN_INCR(session, lsm_work_units_done);
+ *entryp = entry;
+ return (0);
+}
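+
+/*
+ * A sketch of the expected consumer (hypothetical call site; the real
+ * loop lives in the LSM worker code): a worker pops a unit matching its
+ * type mask, services it, then frees it, which releases the queue_ref
+ * taken by __wt_lsm_manager_push_entry:
+ *
+ * WT_LSM_WORK_UNIT *entry;
+ * WT_RET(__wt_lsm_manager_pop_entry(
+ * session, WT_LSM_WORK_SWITCH, &entry));
+ * if (entry != NULL) {
+ * ...service the work unit...
+ * __wt_lsm_manager_free_work_unit(session, entry);
+ * }
+ */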
+
+/*
+ * Push a work unit onto the appropriate queue. This macro assumes we are
+ * called from __wt_lsm_manager_push_entry and we have session and entry
+ * available for use.
+ */
+#define LSM_PUSH_ENTRY(qh, qlock, qlen) do { \
+ __wt_spin_lock(session, qlock); \
+ TAILQ_INSERT_TAIL((qh), entry, q); \
+ WT_STAT_FAST_CONN_INCR(session, qlen); \
+ __wt_spin_unlock(session, qlock); \
+} while (0)
+
+/*
+ * __wt_lsm_manager_push_entry --
+ * Add an entry to the end of the switch queue.
+ */
+int
+__wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
+ uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree)
+{
+ WT_LSM_MANAGER *manager;
+ WT_LSM_WORK_UNIT *entry;
+
+ manager = &S2C(session)->lsm_manager;
+
+ WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts));
+
+ WT_RET(__wt_calloc_def(session, 1, &entry));
+ entry->type = type;
+ entry->flags = flags;
+ entry->lsm_tree = lsm_tree;
+ (void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1);
+ WT_STAT_FAST_CONN_INCR(session, lsm_work_units_created);
+
+ if (type == WT_LSM_WORK_SWITCH)
+ LSM_PUSH_ENTRY(&manager->switchqh,
+ &manager->switch_lock, lsm_work_queue_switch);
+ else if (type == WT_LSM_WORK_MERGE)
+ LSM_PUSH_ENTRY(&manager->managerqh,
+ &manager->manager_lock, lsm_work_queue_manager);
+ else
+ LSM_PUSH_ENTRY(&manager->appqh,
+ &manager->app_lock, lsm_work_queue_app);
+
+ WT_RET(__wt_cond_signal(session, manager->work_cond));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_merge.c b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
new file mode 100644
index 00000000000..784837092cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_merge.c
@@ -0,0 +1,489 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_merge_update_tree --
+ * Merge a set of chunks and populate a new one.
+ * Must be called with the LSM lock held.
+ */
+int
+__wt_lsm_merge_update_tree(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, u_int start_chunk, u_int nchunks,
+ WT_LSM_CHUNK *chunk)
+{
+ size_t chunks_after_merge;
+ u_int i;
+
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+
+ /* Setup the array of obsolete chunks. */
+ WT_RET(__wt_realloc_def(session, &lsm_tree->old_alloc,
+ lsm_tree->nold_chunks + nchunks, &lsm_tree->old_chunks));
+
+ /* Copy entries one at a time, so we can reuse gaps in the list. */
+ for (i = 0; i < nchunks; i++)
+ lsm_tree->old_chunks[lsm_tree->nold_chunks++] =
+ lsm_tree->chunk[start_chunk + i];
+
+ /* Update the current chunk list. */
+ chunks_after_merge = lsm_tree->nchunks - (nchunks + start_chunk);
+ memmove(lsm_tree->chunk + start_chunk + 1,
+ lsm_tree->chunk + start_chunk + nchunks,
+ chunks_after_merge * sizeof(*lsm_tree->chunk));
+ lsm_tree->nchunks -= nchunks - 1;
+ memset(lsm_tree->chunk + lsm_tree->nchunks, 0,
+ (nchunks - 1) * sizeof(*lsm_tree->chunk));
+ lsm_tree->chunk[start_chunk] = chunk;
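+
+ /*
+ * To illustrate (hypothetical state): merging nchunks == 3 starting
+ * at slot 1 of a 6-chunk array turns { A, B, C, D, E, F } into
+ * { A, M, E, F }: B, C and D were moved to old_chunks above, E and
+ * F slid down, and M is the newly installed merged chunk.
+ */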
+
+ return (0);
+}
+
+/*
+ * __wt_lsm_merge --
+ * Merge a set of chunks of an LSM tree.
+ */
+int
+__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *dest, *src;
+ WT_DECL_ITEM(bbuf);
+ WT_DECL_RET;
+ WT_ITEM key, value;
+ WT_LSM_CHUNK *chunk, *previous, *youngest;
+ uint32_t aggressive, generation, max_gap, max_gen, max_level, start_id;
+ uint64_t insert_count, record_count, chunk_size;
+ u_int dest_id, end_chunk, i, merge_max, merge_min, nchunks, start_chunk;
+ u_int verb;
+ int create_bloom, locked, in_sync, tret;
+ const char *cfg[3];
+ const char *drop_cfg[] =
+ { WT_CONFIG_BASE(session, session_drop), "force", NULL };
+
+ bloom = NULL;
+ chunk_size = 0;
+ create_bloom = 0;
+ dest = src = NULL;
+ locked = 0;
+ start_id = 0;
+ in_sync = 0;
+
+ /*
+ * If the tree is open read-only or we are compacting, be very
+ * aggressive. Otherwise, we can spend a long time waiting for merges
+ * to start in read-only applications.
+ */
+ if (!lsm_tree->modified ||
+ F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
+ lsm_tree->merge_aggressiveness = 10;
+
+ aggressive = lsm_tree->merge_aggressiveness;
+ merge_max = (aggressive > 5) ? 100 : lsm_tree->merge_max;
+ merge_min = (aggressive > 5) ? 2 : lsm_tree->merge_min;
+ max_gap = (aggressive + 4) / 5;
+ max_level = (lsm_tree->merge_throttle > 0) ? 0 : id + aggressive;
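+
+ /*
+ * For example (illustrative values): merge_aggressiveness == 7
+ * widens the window to merge_max == 100 and merge_min == 2,
+ * tolerates max_gap == 2 generations of slack, and raises
+ * max_level unless merges are being throttled.
+ */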
+
+ /*
+ * If there aren't any chunks to merge, or some of the chunks aren't
+ * yet written, we're done. A non-zero error indicates that the worker
+ * should assume there is no work to do: if there are unwritten chunks,
+ * the worker should write them immediately.
+ */
+ if (lsm_tree->nchunks < merge_min)
+ return (WT_NOTFOUND);
+
+ /*
+ * Use the lsm_tree lock to read the chunks (so no switches occur), but
+ * avoid holding it while the merge is in progress: that may take a
+ * long time.
+ */
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ /*
+ * Only include chunks that already have a Bloom filter or are the
+ * result of a merge and not involved in a merge.
+ */
+ for (end_chunk = lsm_tree->nchunks - 1; end_chunk > 0; --end_chunk) {
+ chunk = lsm_tree->chunk[end_chunk];
+ WT_ASSERT(session, chunk != NULL);
+ if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING))
+ continue;
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || chunk->generation > 0)
+ break;
+ else if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ break;
+ }
+
+ /*
+ * Give up immediately if there aren't enough on disk chunks in the
+ * tree for a merge.
+ */
+ if (end_chunk < merge_min - 1) {
+ WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ return (WT_NOTFOUND);
+ }
+
+ /*
+ * Look for the most efficient merge we can do. We define efficiency
+ * as collapsing as many levels as possible while processing the
+ * smallest number of rows.
+ *
+ * We make a distinction between "major" and "minor" merges. The
+ * difference is whether the oldest chunk is involved: if it is, we can
+ * discard tombstones, because there can be no older record for them
+ * to mark deleted.
+ *
+ * Respect the configured limit on the number of chunks to merge: start
+ * with the most recent set of chunks and work backwards until going
+ * further becomes significantly less efficient.
+ */
+ for (start_chunk = end_chunk + 1, record_count = 0;
+ start_chunk > 0; ) {
+ chunk = lsm_tree->chunk[start_chunk - 1];
+ youngest = lsm_tree->chunk[end_chunk];
+ nchunks = (end_chunk + 1) - start_chunk;
+
+ /*
+ * If the chunk is already involved in a merge or a Bloom
+ * filter is being built for it, stop.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_MERGING) || chunk->bloom_busy)
+ break;
+
+ /*
+ * Look for small merges before trying a big one: some threads
+ * should stay in low levels until we get more aggressive.
+ */
+ if (chunk->generation > max_level)
+ break;
+
+ /*
+ * If the size of the chunks selected so far exceeds the
+ * configured maximum chunk size, stop. Keep going if we can
+ * slide the window further into the tree: we don't want to
+ * leave small chunks in the middle.
+ */
+ if ((chunk_size += chunk->size) > lsm_tree->chunk_max)
+ if (nchunks < merge_min ||
+ (chunk->generation > youngest->generation &&
+ chunk_size - youngest->size > lsm_tree->chunk_max))
+ break;
+
+ /*
+ * If we have enough chunks for a merge and the next chunk is
+ * in too high a generation, stop.
+ */
+ if (nchunks >= merge_min) {
+ previous = lsm_tree->chunk[start_chunk];
+ max_gen = youngest->generation + max_gap;
+ if (previous->generation <= max_gen &&
+ chunk->generation > max_gen)
+ break;
+ }
+
+ F_SET(chunk, WT_LSM_CHUNK_MERGING);
+ record_count += chunk->count;
+ --start_chunk;
+
+ /*
+ * If we have a full window, or the merge would be too big,
+ * remove the youngest chunk.
+ */
+ if (nchunks == merge_max ||
+ chunk_size > lsm_tree->chunk_max) {
+ WT_ASSERT(session,
+ F_ISSET(youngest, WT_LSM_CHUNK_MERGING));
+ F_CLR(youngest, WT_LSM_CHUNK_MERGING);
+ record_count -= youngest->count;
+ chunk_size -= youngest->size;
+ --end_chunk;
+ }
+ }
+
+ nchunks = (end_chunk + 1) - start_chunk;
+ WT_ASSERT(session, nchunks <= merge_max);
+
+ if (nchunks > 0) {
+ WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
+ for (i = 0; i < nchunks; i++) {
+ chunk = lsm_tree->chunk[start_chunk + i];
+ WT_ASSERT(session,
+ F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+ }
+
+ chunk = lsm_tree->chunk[start_chunk];
+ youngest = lsm_tree->chunk[end_chunk];
+ start_id = chunk->id;
+
+ /*
+ * Don't do merges that are too small or across too many
+ * generations.
+ */
+ if (nchunks < merge_min ||
+ chunk->generation > youngest->generation + max_gap) {
+ for (i = 0; i < nchunks; i++) {
+ chunk = lsm_tree->chunk[start_chunk + i];
+ WT_ASSERT(session,
+ F_ISSET(chunk, WT_LSM_CHUNK_MERGING));
+ F_CLR(chunk, WT_LSM_CHUNK_MERGING);
+ }
+ nchunks = 0;
+ }
+ }
+
+ /* Find the merge generation. */
+ for (generation = 0, i = 0; i < nchunks; i++)
+ generation = WT_MAX(generation,
+ lsm_tree->chunk[start_chunk + i]->generation + 1);
+
+ WT_RET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (nchunks == 0)
+ return (WT_NOTFOUND);
+
+ /* Allocate an ID for the merge. */
+ dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+
+ /*
+ * We only want to do the chunk loop if we're running with verbose,
+ * so we wrap these statements in the conditional. Avoid the loop
+ * in the normal path.
+ */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
+ ", generation %" PRIu32,
+ lsm_tree->name,
+ start_chunk, end_chunk, dest_id, record_count, generation));
+ for (verb = start_chunk; verb <= end_chunk; verb++)
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "%s: Chunk[%u] id %u",
+ lsm_tree->name, verb, lsm_tree->chunk[verb]->id));
+ }
+
+ WT_RET(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = dest_id;
+
+ if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
+ (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
+ start_chunk > 0) && record_count > 0)
+ create_bloom = 1;
+
+ /*
+ * Special setup for the merge cursor:
+ * first, reset to open the dependent cursors;
+ * then restrict the cursor to a specific number of chunks;
+ * then set MERGE so the cursor doesn't track updates to the tree.
+ */
+ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
+ F_SET(src, WT_CURSTD_RAW);
+ WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+ WT_ERR(ret);
+ if (create_bloom) {
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+ WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
+ lsm_tree->bloom_config,
+ record_count, lsm_tree->bloom_bit_count,
+ lsm_tree->bloom_hash_count, &bloom));
+ }
+
+ /* Discard pages we read as soon as we're done with them. */
+ F_SET(session, WT_SESSION_NO_CACHE);
+
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = "bulk,raw,skip_sort_check";
+ cfg[2] = NULL;
+ WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+
+#define LSM_MERGE_CHECK_INTERVAL 1000
+ for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ WT_ERR(EINTR);
+ /*
+ * Help out with switching chunks in case the
+ * checkpoint worker is busy.
+ */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_lsm_tree_switch(session, lsm_tree));
+ WT_ERR(ret);
+ }
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
+ ++lsm_tree->merge_progressing;
+ }
+
+ WT_ERR(src->get_key(src, &key));
+ dest->set_key(dest, &key);
+ WT_ERR(src->get_value(src, &value));
+ dest->set_value(dest, &value);
+ WT_ERR(dest->insert(dest));
+ if (create_bloom)
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
+ ++lsm_tree->merge_progressing;
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
+ record_count, insert_count));
+
+ /*
+ * Closing and syncing the files can take a while. Set the
+ * merge_syncing field so that compact knows it is still in
+ * progress.
+ */
+ (void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1);
+ in_sync = 1;
+ /*
+ * We've successfully created the new chunk. Now install it. We need
+ * to ensure that the NO_CACHE flag is cleared and the bloom filter
+ * is closed (even if a step fails), so track errors but don't return
+ * until we've cleaned up.
+ */
+ WT_TRET(src->close(src));
+ WT_TRET(dest->close(dest));
+ src = dest = NULL;
+
+ F_CLR(session, WT_SESSION_NO_CACHE);
+
+ /*
+ * We're doing advisory reads to fault the new trees into cache.
+ * Don't block if the cache is full: our next unit of work may be to
+ * discard some trees to free space.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+
+ if (create_bloom) {
+ if (ret == 0)
+ WT_TRET(__wt_bloom_finalize(bloom));
+
+ /*
+		 * Read in a key to make sure the Bloom filter's btree handle is
+ * open before it becomes visible to application threads.
+ * Otherwise application threads will stall while it is opened
+ * and internal pages are read into cache.
+ */
+ if (ret == 0) {
+ WT_CLEAR(key);
+ WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
+ }
+
+ WT_TRET(__wt_bloom_close(bloom));
+ bloom = NULL;
+ }
+ WT_ERR(ret);
+
+ /*
+ * Open a handle on the new chunk before application threads attempt
+	 * to access it; opening it pre-loads internal pages into the file
+ * system cache.
+ */
+ cfg[1] = "checkpoint=" WT_CHECKPOINT;
+ WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
+ WT_TRET(dest->close(dest));
+ dest = NULL;
+ ++lsm_tree->merge_progressing;
+ (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ in_sync = 0;
+ WT_ERR_NOTFOUND_OK(ret);
+
+ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * Check whether we raced with another merge, and adjust the chunk
+ * array offset as necessary.
+ */
+ if (start_chunk >= lsm_tree->nchunks ||
+ lsm_tree->chunk[start_chunk]->id != start_id)
+ for (start_chunk = 0;
+ start_chunk < lsm_tree->nchunks;
+ start_chunk++)
+ if (lsm_tree->chunk[start_chunk]->id == start_id)
+ break;
+
+ /*
+	 * It is safe to error out here: the update can only fail before
+	 * it makes any changes to the tree.
+ */
+ WT_ERR(__wt_lsm_merge_update_tree(
+ session, lsm_tree, start_chunk, nchunks, chunk));
+
+ if (create_bloom)
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ chunk->count = insert_count;
+ chunk->generation = generation;
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+
+ /*
+ * We have no current way of continuing if the metadata update fails,
+ * so we will panic in that case. Put some effort into cleaning up
+	 * after ourselves here, so things have a chance of shutting down.
+ *
+ * Any errors that happened after the tree was locked are
+ * fatal - we can't guarantee the state of the tree.
+ */
+ if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
+ WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");
+
+ lsm_tree->dsk_gen++;
+
+ /* Update the throttling while holding the tree lock. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 1);
+
+	/* Schedule a pass to discard old chunks. */
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_DROP, 0, lsm_tree));
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (in_sync)
+ (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ if (src != NULL)
+ WT_TRET(src->close(src));
+ if (dest != NULL)
+ WT_TRET(dest->close(dest));
+ if (bloom != NULL)
+ WT_TRET(__wt_bloom_close(bloom));
+ __wt_scr_free(&bbuf);
+ if (ret != 0) {
+ /* Drop the newly-created files on error. */
+ WT_WITH_SCHEMA_LOCK(session,
+ tret = __wt_schema_drop(session, chunk->uri, drop_cfg));
+ WT_TRET(tret);
+ if (create_bloom) {
+ WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
+ session, chunk->bloom_uri, drop_cfg));
+ WT_TRET(tret);
+ }
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+
+ if (ret == EINTR)
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Merge aborted due to close"));
+ else
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Merge failed with %s", wiredtiger_strerror(ret)));
+ }
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ return (ret);
+}
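+
+#if 0
+/*
+ * A minimal, self-contained sketch of the copy loop at the heart of
+ * __wt_lsm_merge, written against the public WiredTiger API instead of
+ * the internal one; the home directory and table names are hypothetical
+ * and the snippet is illustrative only, not part of the build.
+ */
+#include <stdlib.h>
+#include <wiredtiger.h>
+
+int
+main(void)
+{
+	WT_CONNECTION *conn;
+	WT_CURSOR *dest, *src;
+	WT_ITEM key, value;
+	WT_SESSION *session;
+	int ret;
+
+	if (wiredtiger_open("WT_HOME", NULL, "create", &conn) != 0)
+		return (EXIT_FAILURE);
+	if (conn->open_session(conn, NULL, NULL, &session) != 0)
+		return (EXIT_FAILURE);
+	if (session->create(session,
+	    "table:src", "key_format=u,value_format=u") != 0 ||
+	    session->create(session,
+	    "table:dst", "key_format=u,value_format=u") != 0)
+		return (EXIT_FAILURE);
+
+	/* Raw cursors pass keys and values as WT_ITEMs, as merge does. */
+	if (session->open_cursor(
+	    session, "table:src", NULL, "raw", &src) != 0)
+		return (EXIT_FAILURE);
+	/* A bulk cursor requires an empty target, like a new chunk. */
+	if (session->open_cursor(
+	    session, "table:dst", NULL, "bulk,raw", &dest) != 0)
+		return (EXIT_FAILURE);
+
+	while ((ret = src->next(src)) == 0) {
+		if ((ret = src->get_key(src, &key)) != 0 ||
+		    (ret = src->get_value(src, &value)) != 0)
+			break;
+		dest->set_key(dest, &key);
+		dest->set_value(dest, &value);
+		if ((ret = dest->insert(dest)) != 0)
+			break;
+	}
+	/* WT_NOTFOUND from next means a clean end of the source table. */
+	if (ret == WT_NOTFOUND)
+		ret = 0;
+	(void)conn->close(conn, NULL);
+	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+#endif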
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_meta.c b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
new file mode 100644
index 00000000000..fbb5a9958d5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_meta.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_lsm_meta_read --
+ * Read the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_CONFIG cparser, lparser;
+ WT_CONFIG_ITEM ck, cv, lk, lv;
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_NAMED_COLLATOR *ncoll;
+ const char *lsmconfig;
+ u_int nchunks;
+
+ chunk = NULL; /* -Wconditional-uninitialized */
+
+ WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
+ WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
+ while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
+ if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->key_format);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->key_format));
+ } else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->value_format);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->value_format));
+ } else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
+ if (cv.len == 0)
+ continue;
+ TAILQ_FOREACH(ncoll, &S2C(session)->collqh, q) {
+ if (WT_STRING_MATCH(
+ ncoll->name, cv.str, cv.len)) {
+ lsm_tree->collator = ncoll->collator;
+ break;
+ }
+ }
+ if (lsm_tree->collator == NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "unknown collator '%.*s'",
+ (int)cv.len, cv.str);
+ WT_ERR(__wt_strndup(session,
+ cv.str, cv.len, &lsm_tree->collator_name));
+ } else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->bloom_config);
+ /* Don't include the brackets. */
+ WT_ERR(__wt_strndup(session,
+ cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
+ } else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
+ __wt_free(session, lsm_tree->file_config);
+ /* Don't include the brackets. */
+ WT_ERR(__wt_strndup(session,
+ cv.str + 1, cv.len - 2, &lsm_tree->file_config));
+ } else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
+ if (cv.val)
+ F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+ else
+ F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+ } else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
+ lsm_tree->bloom = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
+ lsm_tree->bloom_bit_count = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
+ lsm_tree->bloom_hash_count = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
+ lsm_tree->chunk_max = (uint64_t)cv.val;
+ else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
+ lsm_tree->chunk_size = (uint64_t)cv.val;
+ else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
+ lsm_tree->merge_max = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
+ lsm_tree->merge_min = (uint32_t)cv.val;
+ else if (WT_STRING_MATCH("last", ck.str, ck.len))
+ lsm_tree->last = (u_int)cv.val;
+ else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
+ WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+ for (nchunks = 0; (ret =
+ __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+ if (WT_STRING_MATCH("id", lk.str, lk.len)) {
+ WT_ERR(__wt_realloc_def(session,
+ &lsm_tree->chunk_alloc,
+ nchunks + 1, &lsm_tree->chunk));
+ WT_ERR(__wt_calloc_def(
+ session, 1, &chunk));
+ lsm_tree->chunk[nchunks++] = chunk;
+ chunk->id = (uint32_t)lv.val;
+ WT_ERR(__wt_lsm_tree_chunk_name(session,
+ lsm_tree, chunk->id, &chunk->uri));
+ F_SET(chunk,
+ WT_LSM_CHUNK_ONDISK |
+ WT_LSM_CHUNK_STABLE);
+ } else if (WT_STRING_MATCH(
+ "bloom", lk.str, lk.len)) {
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree,
+ chunk->id, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ continue;
+ } else if (WT_STRING_MATCH(
+ "chunk_size", lk.str, lk.len)) {
+ chunk->size = (uint64_t)lv.val;
+ continue;
+ } else if (WT_STRING_MATCH(
+ "count", lk.str, lk.len)) {
+ chunk->count = (uint64_t)lv.val;
+ continue;
+ } else if (WT_STRING_MATCH(
+ "generation", lk.str, lk.len)) {
+ chunk->generation = (uint32_t)lv.val;
+ continue;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ lsm_tree->nchunks = nchunks;
+ } else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
+ WT_ERR(__wt_config_subinit(session, &lparser, &cv));
+ for (nchunks = 0; (ret =
+ __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
+ if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
+ WT_ERR(__wt_strndup(session,
+ lv.str, lv.len, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ continue;
+ }
+ WT_ERR(__wt_realloc_def(session,
+ &lsm_tree->old_alloc, nchunks + 1,
+ &lsm_tree->old_chunks));
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ lsm_tree->old_chunks[nchunks++] = chunk;
+ WT_ERR(__wt_strndup(session,
+ lk.str, lk.len, &chunk->uri));
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ lsm_tree->nold_chunks = nchunks;
+ /* Values included for backward compatibility */
+ } else if (WT_STRING_MATCH("merge_threads", ck.str, ck.len)) {
+ } else
+ WT_ERR(__wt_illegal_value(session, "LSM metadata"));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /*
+ * If the default merge_min was not overridden, calculate it now. We
+ * do this here so that trees created before merge_min was added get a
+ * sane value.
+ */
+ if (lsm_tree->merge_min < 2)
+ lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);
+
+err: __wt_free(session, lsmconfig);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_meta_write --
+ * Write the metadata for an LSM tree.
+ */
+int
+__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ u_int i;
+ int first;
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
+ lsm_tree->key_format, lsm_tree->value_format,
+ lsm_tree->bloom_config, lsm_tree->file_config));
+ if (lsm_tree->collator_name != NULL)
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",collator=%s", lsm_tree->collator_name));
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",last=%" PRIu32
+ ",chunk_max=%" PRIu64
+ ",chunk_size=%" PRIu64
+ ",auto_throttle=%" PRIu32
+ ",merge_max=%" PRIu32
+ ",merge_min=%" PRIu32
+ ",bloom=%" PRIu32
+ ",bloom_bit_count=%" PRIu32
+ ",bloom_hash_count=%" PRIu32,
+ lsm_tree->last, lsm_tree->chunk_max, lsm_tree->chunk_size,
+ F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
+ lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
+ lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));
+ WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=["));
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ if (i > 0)
+ WT_ERR(__wt_buf_catfmt(session, buf, ","));
+ WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_buf_catfmt(session, buf, ",bloom"));
+ if (chunk->size != 0)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",chunk_size=%" PRIu64, chunk->size));
+ if (chunk->count != 0)
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",count=%" PRIu64, chunk->count));
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",generation=%" PRIu32, chunk->generation));
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+ WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=["));
+ first = 1;
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ chunk = lsm_tree->old_chunks[i];
+ WT_ASSERT(session, chunk != NULL);
+ if (first)
+ first = 0;
+ else
+ WT_ERR(__wt_buf_catfmt(session, buf, ","));
+ WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, ",bloom=\"%s\"", chunk->bloom_uri));
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, "]"));
+ ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
+ WT_ERR(ret);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
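+
+/*
+ * For reference, the string built above looks roughly like this for a
+ * small two-chunk tree (all numeric values are illustrative):
+ *
+ *	key_format=u,value_format=u,bloom_config=(),file_config=(),
+ *	last=2,chunk_max=5368709120,chunk_size=10485760,auto_throttle=1,
+ *	merge_max=15,merge_min=4,bloom=1,bloom_bit_count=16,
+ *	bloom_hash_count=8,chunks=[id=1,bloom,chunk_size=10485760,
+ *	count=100000,generation=1,id=2,generation=0],old_chunks=[]
+ */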
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
new file mode 100644
index 00000000000..dc7d17e7a2c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __lsm_stat_init --
+ *	Initialize an LSM statistics structure.
+ */
+static int
+__lsm_stat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst)
+{
+ WT_CURSOR *stat_cursor;
+ WT_DECL_ITEM(uribuf);
+ WT_DECL_RET;
+ WT_DSRC_STATS *new, *stats;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int locked;
+ char config[64];
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor), NULL, NULL };
+ const char *disk_cfg[] = {
+ WT_CONFIG_BASE(session, session_open_cursor),
+ "checkpoint=" WT_CHECKPOINT, NULL, NULL };
+
+ locked = 0;
+ WT_RET(__wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ WT_ERR(__wt_scr_alloc(session, 0, &uribuf));
+
+ /* Propagate all, fast and/or clear to the cursors we open. */
+ if (!F_ISSET(cst, WT_CONN_STAT_NONE)) {
+ (void)snprintf(config, sizeof(config),
+ "statistics=(%s%s%s)",
+ F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "",
+ F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "",
+ !F_ISSET(cst, WT_CONN_STAT_ALL) &&
+ F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "");
+ cfg[1] = disk_cfg[1] = config;
+ }
+
+ /*
+ * Set the cursor to reference the data source statistics; we don't
+	 * initialize it; instead we copy (rather than aggregate) the first
+ * chunk's statistics, which has the same effect.
+ */
+ stats = &cst->u.dsrc_stats;
+
+ /* Hold the LSM lock so that we can safely walk through the chunks. */
+ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+
+ /*
+ * For each chunk, aggregate its statistics, as well as any associated
+ * bloom filter statistics, into the total statistics.
+ */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+
+ /*
+ * Get the statistics for the chunk's underlying object.
+ *
+ * XXX kludge: we may have an empty chunk where no checkpoint
+ * was written. If so, try to open the ordinary handle on that
+ * chunk instead.
+ */
+ WT_ERR(__wt_buf_fmt(
+ session, uribuf, "statistics:%s", chunk->uri));
+ ret = __wt_curstat_open(session, uribuf->data,
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
+ &stat_cursor);
+ if (ret == WT_NOTFOUND &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ ret = __wt_curstat_open(
+ session, uribuf->data, cfg, &stat_cursor);
+ WT_ERR(ret);
+
+ /*
+ * The underlying statistics have now been initialized; fill in
+ * values from the chunk's information, then aggregate into the
+ * top-level.
+ */
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ WT_STAT_SET(new, lsm_generation_max, chunk->generation);
+
+ /*
+ * We want to aggregate the table's statistics. Get a base set
+ * of statistics from the first chunk, then aggregate statistics
+ * from each new chunk.
+ */
+ if (i == 0)
+ *stats = *new;
+ else
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ continue;
+
+ /* Maintain a count of bloom filters. */
+ WT_STAT_INCR(&lsm_tree->stats, bloom_count);
+
+ /* Get the bloom filter's underlying object. */
+ WT_ERR(__wt_buf_fmt(
+ session, uribuf, "statistics:%s", chunk->bloom_uri));
+ WT_ERR(__wt_curstat_open(
+ session, uribuf->data, cfg, &stat_cursor));
+
+ /*
+ * The underlying statistics have now been initialized; fill in
+ * values from the bloom filter's information, then aggregate
+ * into the top-level.
+ */
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ WT_STAT_SET(new,
+ bloom_size, (chunk->count * lsm_tree->bloom_bit_count) / 8);
+ WT_STAT_SET(new, bloom_page_evict,
+ WT_STAT(new, cache_eviction_clean) +
+ WT_STAT(new, cache_eviction_dirty));
+ WT_STAT_SET(new, bloom_page_read, WT_STAT(new, cache_read));
+
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+	/* Set statistics that aren't aggregated directly into the cursor. */
+ WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
+
+ /* Aggregate, and optionally clear, LSM-level specific information. */
+ __wt_stat_aggregate_dsrc_stats(&lsm_tree->stats, stats);
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ __wt_stat_refresh_dsrc_stats(&lsm_tree->stats);
+
+ __wt_curstat_dsrc_final(cst);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+ __wt_scr_free(&uribuf);
+
+ return (ret);
+}
+
+/*
+ * __wt_curstat_lsm_init --
+ *	Initialize the statistics for an LSM tree.
+ */
+int
+__wt_curstat_lsm_init(
+ WT_SESSION_IMPL *session, const char *uri, WT_CURSOR_STAT *cst)
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session, ret = __lsm_stat_init(session, uri, cst));
+
+ return (ret);
+}
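+
+/*
+ * Applications reach this path by opening a statistics cursor on the
+ * tree, for example (the URI is illustrative):
+ *
+ *	session->open_cursor(session, "statistics:lsm:example", NULL,
+ *	    "statistics=(fast)", &cursor);
+ */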
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
new file mode 100644
index 00000000000..447a8eb60a6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -0,0 +1,1266 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
+static int __lsm_tree_open(WT_SESSION_IMPL *, const char *, WT_LSM_TREE **);
+static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
+
+/*
+ * __lsm_tree_discard --
+ * Free an LSM tree structure.
+ */
+static int
+__lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ u_int i;
+
+ /* We may be destroying an lsm_tree before it was added. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN))
+ TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q);
+
+ __wt_free(session, lsm_tree->name);
+ __wt_free(session, lsm_tree->config);
+ __wt_free(session, lsm_tree->key_format);
+ __wt_free(session, lsm_tree->value_format);
+ __wt_free(session, lsm_tree->collator_name);
+ __wt_free(session, lsm_tree->bloom_config);
+ __wt_free(session, lsm_tree->file_config);
+
+ WT_TRET(__wt_rwlock_destroy(session, &lsm_tree->rwlock));
+
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ if ((chunk = lsm_tree->chunk[i]) == NULL)
+ continue;
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->chunk);
+
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ chunk = lsm_tree->old_chunks[i];
+ WT_ASSERT(session, chunk != NULL);
+
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, chunk);
+ }
+ __wt_free(session, lsm_tree->old_chunks);
+ __wt_free(session, lsm_tree);
+
+ return (ret);
+}
+
+/*
+ * __lsm_tree_close --
+ * Close an LSM tree structure.
+ */
+static int
+__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ int i;
+
+ /* Stop any active merges. */
+ F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE);
+
+ /*
+ * Wait for all LSM operations and work units that were in flight to
+ * finish.
+ */
+ for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) {
+ /*
+ * Remove any work units from the manager queues. Do this step
+ * repeatedly in case a work unit was in the process of being
+ * created when we cleared the active flag.
+ * !! Drop the schema lock whilst completing this step so that
+ * we don't block any operations that require the schema
+ * lock to complete. This is safe because any operation that
+ * is closing the tree should first have gotten exclusive
+ * access to the LSM tree via __wt_lsm_tree_get, so other
+ * schema level operations will return EBUSY, even though
+ * we're dropping the schema lock here.
+ */
+ if (i % 1000 == 0) {
+ WT_WITHOUT_SCHEMA_LOCK(session, ret =
+ __wt_lsm_manager_clear_tree(session, lsm_tree));
+ WT_RET(ret);
+ }
+ __wt_yield();
+ }
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_close_all --
+ * Close all LSM tree structures.
+ */
+int
+__wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ while ((lsm_tree = TAILQ_FIRST(&S2C(session)->lsmqh)) != NULL) {
+ /*
+ * Tree close assumes that we have a reference to the tree
+ * so it can tell when it's safe to do the close. We could
+		 * go through tree get here, but short circuit instead. There
+ * is no need to decrement the reference count since destroy
+ * is unconditional.
+ */
+ (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+ WT_TRET(__lsm_tree_close(session, lsm_tree));
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+
+ return (ret);
+}
+
+/*
+ * __lsm_tree_set_name --
+ *	Set or reset the name of an LSM tree.
+ */
+static int
+__lsm_tree_set_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, const char *uri)
+{
+ if (lsm_tree->name != NULL)
+ __wt_free(session, lsm_tree->name);
+ WT_RET(__wt_strdup(session, uri, &lsm_tree->name));
+ lsm_tree->filename = lsm_tree->name + strlen("lsm:");
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_bloom_name --
+ * Get the URI of the Bloom filter for a given chunk.
+ */
+int
+__wt_lsm_tree_bloom_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(
+ session, tmp, "file:%s-%06" PRIu32 ".bf", lsm_tree->filename, id));
+ WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_chunk_name --
+ * Get the URI of the file for a given chunk.
+ */
+int
+__wt_lsm_tree_chunk_name(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, uint32_t id, const char **retp)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_buf_fmt(
+ session, tmp, "file:%s-%06" PRIu32 ".lsm", lsm_tree->filename, id));
+ WT_ERR(__wt_strndup(session, tmp->data, tmp->size, retp));
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
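+
+/*
+ * For a tree named "lsm:example" (filename "example"), chunk 7's file
+ * is "file:example-000007.lsm" and its Bloom filter, if it has one, is
+ * "file:example-000007.bf".
+ */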
+
+/*
+ * __wt_lsm_tree_set_chunk_size --
+ * Set the size of the chunk. Should only be called for chunks that are
+ * on disk, or about to become on disk.
+ */
+int
+__wt_lsm_tree_set_chunk_size(
+ WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk)
+{
+ wt_off_t size;
+ const char *filename;
+
+ filename = chunk->uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_RET_MSG(session, EINVAL,
+ "Expected a 'file:' URI: %s", chunk->uri);
+ WT_RET(__wt_filesize_name(session, filename, &size));
+
+ chunk->size = (uint64_t)size;
+
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_setup_chunk --
+ * Initialize a chunk of an LSM tree.
+ */
+int
+__wt_lsm_tree_setup_chunk(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+{
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_drop), "force", NULL };
+ int exists;
+
+ WT_RET(__wt_epoch(session, &chunk->create_ts));
+
+ WT_RET(__wt_lsm_tree_chunk_name(
+ session, lsm_tree, chunk->id, &chunk->uri));
+
+ /*
+ * If the underlying file exists, drop the chunk first - there may be
+ * some content hanging over from an aborted merge or checkpoint.
+ *
+ * Don't do this for the very first chunk: we are called during
+ * WT_SESSION::create, and doing a drop inside there does interesting
+ * things with handle locks and metadata tracking. It can never have
+ * been the result of an interrupted merge, anyway.
+ */
+ if (chunk->id > 1) {
+ WT_RET(__wt_exist(
+ session, chunk->uri + strlen("file:"), &exists));
+ if (exists)
+ WT_RET(__wt_schema_drop(session, chunk->uri, cfg));
+ }
+ return (__wt_schema_create(session, chunk->uri, lsm_tree->file_config));
+}
+
+/*
+ * __wt_lsm_tree_create --
+ * Create an LSM tree structure for the given name.
+ */
+int
+__wt_lsm_tree_create(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_create), config, NULL };
+ const char *tmpconfig;
+
+ /* If the tree is open, it already exists. */
+ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ /*
+ * If the tree has metadata, it already exists.
+ *
+ * !!!
+ * Use a local variable: we don't care what the existing configuration
+ * is, but we don't want to overwrite the real config.
+ */
+ if (__wt_metadata_search(session, uri, &tmpconfig) == 0) {
+ __wt_free(session, tmpconfig);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+ if (WT_STRING_MATCH("r", cval.str, cval.len))
+ WT_RET_MSG(session, EINVAL,
+ "LSM trees cannot be configured as column stores");
+
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
+
+ WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->key_format));
+ WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->value_format));
+
+ WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->collator_name));
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
+ if (cval.val)
+ F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
+ else
+ F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
+ FLD_SET(lsm_tree->bloom,
+ (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
+ if (cval.val != 0)
+ FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);
+
+ if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
+ FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
+ WT_ERR_MSG(session, EINVAL,
+ "Bloom filters can only be created on newest and oldest "
+ "chunks if bloom filters are enabled");
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
+ if (cval.type == WT_CONFIG_ITEM_STRUCT) {
+ cval.str++;
+ cval.len -= 2;
+ }
+ WT_ERR(__wt_strndup(
+ session, cval.str, cval.len, &lsm_tree->bloom_config));
+
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
+ lsm_tree->bloom_bit_count = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
+ lsm_tree->bloom_hash_count = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
+ lsm_tree->chunk_max = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
+ lsm_tree->chunk_size = (uint64_t)cval.val;
+ if (lsm_tree->chunk_size > lsm_tree->chunk_max)
+ WT_ERR_MSG(session, EINVAL,
+ "Chunk size (chunk_size) must be smaller than or equal to "
+ "the maximum chunk size (chunk_max)");
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
+ lsm_tree->merge_max = (uint32_t)cval.val;
+ WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval));
+ lsm_tree->merge_min = (uint32_t)cval.val;
+ if (lsm_tree->merge_min > lsm_tree->merge_max)
+ WT_ERR_MSG(session, EINVAL,
+ "LSM merge_min must be less than or equal to merge_max");
+
+ /*
+ * Set up the config for each chunk.
+ *
+ * Make the memory_page_max double the chunk size, so application
+ * threads don't immediately try to force evict the chunk when the
+ * worker thread clears the NO_EVICTION flag.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
+ config, 2 * lsm_tree->chunk_max));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &lsm_tree->file_config));
+
+ /* Create the first chunk and flush the metadata. */
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+ /* Discard our partially populated handle. */
+ ret = __lsm_tree_discard(session, lsm_tree);
+ lsm_tree = NULL;
+
+ /*
+ * Open our new tree and add it to the handle cache. Don't discard on
+ * error: the returned handle is NULL on error, and the metadata
+ * tracking macros handle cleaning up on failure.
+ */
+ if (ret == 0)
+ ret = __lsm_tree_open(session, uri, &lsm_tree);
+ if (ret == 0)
+ __wt_lsm_tree_release(session, lsm_tree);
+
+ if (0) {
+err: WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __lsm_tree_open_check --
+ * Validate the configuration of an LSM tree.
+ */
+static int
+__lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_CONFIG_ITEM cval;
+ uint64_t maxleafpage, required;
+ const char *cfg[] = { WT_CONFIG_BASE(
+ session, session_create), lsm_tree->file_config, NULL };
+
+ WT_RET(__wt_config_gets(session, cfg, "leaf_page_max", &cval));
+ maxleafpage = (uint64_t)cval.val;
+
+ /*
+ * Three chunks, plus one page for each participant in up to three
+ * concurrent merges.
+ */
+ required = 3 * lsm_tree->chunk_size +
+ 3 * (lsm_tree->merge_max * maxleafpage);
+ if (S2C(session)->cache_size < required)
+ WT_RET_MSG(session, EINVAL,
+ "LSM cache size %" PRIu64 " (%" PRIu64 "MB) too small, "
+ "must be at least %" PRIu64 " (%" PRIu64 "MB)",
+ S2C(session)->cache_size,
+ S2C(session)->cache_size / WT_MEGABYTE,
+ required, required / WT_MEGABYTE);
+ return (0);
+}
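+
+/*
+ * For example, with a 10MB chunk_size, a merge_max of 15 and 32KB leaf
+ * pages, the check above requires a cache of at least
+ * 3 * 10MB + 3 * (15 * 32KB), roughly 31.4MB, before the tree will
+ * open.
+ */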
+
+/*
+ * __lsm_tree_open --
+ * Open an LSM tree structure.
+ */
+static int
+__lsm_tree_open(
+ WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_TREE *lsm_tree;
+
+ conn = S2C(session);
+ lsm_tree = NULL;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ /* Start the LSM manager thread if it isn't running. */
+ if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
+ WT_RET(__wt_lsm_manager_start(session));
+
+ /* Make sure no one beat us to it. */
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ *treep = lsm_tree;
+ return (0);
+ }
+
+ /* Try to open the tree. */
+ WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
+ WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree"));
+
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));
+
+ WT_ERR(__wt_lsm_meta_read(session, lsm_tree));
+
+ /*
+ * Sanity check the configuration. Do it now since this is the first
+ * time we have the LSM tree configuration.
+ */
+ WT_ERR(__lsm_tree_open_check(session, lsm_tree));
+
+ if (lsm_tree->nchunks == 0) {
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
+ }
+
+ /* Set the generation number so cursors are opened on first usage. */
+ lsm_tree->dsk_gen = 1;
+
+ /*
+	 * Set up reference counting. Use separate reference counts for tree
+ * handles and queue entries, so that queue entries don't interfere
+ * with getting handles exclusive.
+ */
+ lsm_tree->refcnt = 1;
+ lsm_tree->queue_ref = 0;
+
+ /* Set a flush timestamp as a baseline. */
+ WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
+
+	/* Now the tree is set up, make it visible to others. */
+ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
+ F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);
+
+ *treep = lsm_tree;
+
+ if (0) {
+err: WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_get --
+ * Get an LSM tree structure for the given name. Optionally get exclusive
+ *	access to the handle. Exclusive access works separately from the LSM
+ *	tree lock, since operations that need exclusive access may also need
+ *	to take the LSM tree lock, for example outstanding work unit operations.
+ */
+int
+__wt_lsm_tree_get(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, WT_LSM_TREE **treep)
+{
+ WT_LSM_TREE *lsm_tree;
+
+ /* See if the tree is already open. */
+ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
+ if (strcmp(uri, lsm_tree->name) == 0) {
+ /*
+ * Short circuit if the handle is already held
+ * exclusively or exclusive access is requested and
+ * there are references held.
+ */
+ if ((exclusive && lsm_tree->refcnt > 0) ||
+ F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE))
+ return (EBUSY);
+
+ if (exclusive) {
+ F_SET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+ if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) {
+ F_CLR(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+ return (EBUSY);
+ }
+ } else
+ (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+
+ /*
+ * If we got a reference, but an exclusive reference
+ * beat us to it, give our reference up.
+ */
+ if (!exclusive &&
+ F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE)) {
+ (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ return (EBUSY);
+ }
+ *treep = lsm_tree;
+ return (0);
+ }
+
+ /* Open a new tree. */
+ return (__lsm_tree_open(session, uri, treep));
+}
+
+/*
+ * __wt_lsm_tree_release --
+ * Release an LSM tree structure.
+ */
+void
+__wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_ASSERT(session, lsm_tree->refcnt > 0);
+ (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ F_CLR_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
+}
+
+/* How aggressively to ramp up or down throttle due to level 0 merging */
+#define WT_LSM_MERGE_THROTTLE_BUMP_PCT (100 / lsm_tree->merge_max)
+/* Number of level 0 chunks that need to be present to throttle inserts */
+#define WT_LSM_MERGE_THROTTLE_THRESHOLD \
+ (2 * lsm_tree->merge_min)
+/* Minimal throttling time */
+#define WT_LSM_THROTTLE_START 20
+
+#define WT_LSM_MERGE_THROTTLE_INCREASE(val) do { \
+ (val) += ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = WT_LSM_THROTTLE_START; \
+ } while (0)
+
+#define WT_LSM_MERGE_THROTTLE_DECREASE(val) do { \
+ (val) -= ((val) * WT_LSM_MERGE_THROTTLE_BUMP_PCT) / 100; \
+ if ((val) < WT_LSM_THROTTLE_START) \
+ (val) = 0; \
+ } while (0)
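+
+/*
+ * For example, with a merge_max of 15 the bump is 100 / 15 == 6
+ * percent: an increase from zero first snaps the throttle to the 20us
+ * floor and then grows it by about 6% per call, while a decrease
+ * shrinks it by about 6% per call and drops it straight to zero once
+ * it falls below the floor.
+ */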
+
+/*
+ * __wt_lsm_tree_throttle --
+ * Calculate whether LSM updates need to be throttled. Must be called
+ * with the LSM tree lock held.
+ */
+void
+__wt_lsm_tree_throttle(
+ WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
+{
+ WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
+ uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
+ uint32_t in_memory, gen0_chunks;
+
+ /* Never throttle in small trees. */
+ if (lsm_tree->nchunks < 3) {
+ lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
+ return;
+ }
+
+ cache_sz = S2C(session)->cache_size;
+
+ /*
+ * In the steady state, we expect that the checkpoint worker thread
+ * will keep up with inserts. If not, throttle the insert rate to
+ * avoid filling the cache with in-memory chunks. Threads sleep every
+ * 100 operations, so take that into account in the calculation.
+ *
+ * Also throttle based on whether merge threads are keeping up. If
+ * there are enough chunks that have never been merged we slow down
+ * inserts so that merges have some chance of keeping up.
+ *
+	 * Count the number of in-memory chunks, the number of unmerged chunks
+ * on disk, and find the most recent on-disk chunk (if any).
+ */
+ record_count = 1;
+ gen0_chunks = in_memory = 0;
+ ondisk = NULL;
+ for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
+ cp >= lsm_tree->chunk;
+ --cp)
+ if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
+ record_count += (*cp)->count;
+ ++in_memory;
+ } else {
+ /*
+ * Assign ondisk to the last chunk that has been
+			 * flushed since the tree was last opened (i.e., it's on
+ * disk and stable is not set).
+ */
+ if (ondisk == NULL &&
+ ((*cp)->generation == 0 &&
+ !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
+ ondisk = *cp;
+
+ if ((*cp)->generation == 0 &&
+ !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
+ ++gen0_chunks;
+ }
+
+ last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
+
+ /* Checkpoint throttling, based on the number of in-memory chunks. */
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
+ lsm_tree->ckpt_throttle = 0;
+ else if (decrease_only)
+ ; /* Nothing to do */
+ else if (ondisk == NULL) {
+ /*
+ * No checkpoint has completed this run. Keep slowing down
+ * inserts until one does.
+ */
+ lsm_tree->ckpt_throttle =
+ WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
+ } else {
+ WT_ASSERT(session,
+ WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
+ lsm_tree->ckpt_throttle =
+ (long)((in_memory - 2) * timediff / (20 * record_count));
+
+ /*
+ * Get more aggressive as the number of in memory chunks
+ * consumes a large proportion of the cache. In memory chunks
+ * are allowed to grow up to twice as large as the configured
+ * value when checkpoints aren't keeping up. That worst case
+ * is when this calculation is relevant.
+ * There is nothing particularly special about the chosen
+ * multipliers.
+ */
+ cache_used = in_memory * lsm_tree->chunk_size * 2;
+ if (cache_used > cache_sz * 0.8)
+ lsm_tree->ckpt_throttle *= 5;
+ }
+
+ /*
+ * Merge throttling, based on the number of on-disk, level 0 chunks.
+ *
+ * Don't throttle if the tree has less than a single level's number
+ * of chunks.
+ */
+ if (lsm_tree->nchunks < lsm_tree->merge_max)
+ lsm_tree->merge_throttle = 0;
+ else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
+ WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
+ else if (!decrease_only)
+ WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);
+
+ /* Put an upper bound of 1s on both throttle calculations. */
+ lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
+ lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);
+
+ /*
+ * Update our estimate of how long each in-memory chunk stays active.
+ * Filter out some noise by keeping a weighted history of the
+ * calculated value. Wait until we have enough chunks that we can
+ * check that the new value is sane: otherwise, after a long idle
+ * period, we can calculate a crazy value.
+ */
+ if (in_memory > 1 && ondisk != NULL) {
+ prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
+ WT_ASSERT(session, prev_chunk->generation == 0);
+ WT_ASSERT(session, WT_TIMECMP(
+ last_chunk->create_ts, prev_chunk->create_ts) >= 0);
+ timediff =
+ WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
+ WT_ASSERT(session,
+ WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
+ oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
+ if (timediff < 10 * oldtime)
+ lsm_tree->chunk_fill_ms =
+ (3 * lsm_tree->chunk_fill_ms +
+ timediff / 1000000) / 4;
+ }
+}
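+
+/*
+ * The chunk_fill_ms estimate maintained above is a 3:1 exponentially
+ * weighted moving average in milliseconds: each accepted sample moves
+ * the estimate a quarter of the way toward the latest chunk-to-chunk
+ * creation interval.
+ */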
+
+/*
+ * __wt_lsm_tree_switch --
+ * Switch to a new in-memory tree.
+ */
+int
+__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ uint32_t nchunks, new_id;
+ int first_switch;
+
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ nchunks = lsm_tree->nchunks;
+
+ first_switch = nchunks == 0 ? 1 : 0;
+ /*
+ * Check if a switch is still needed: we may have raced while waiting
+ * for a lock.
+ */
+ chunk = NULL;
+ if (!first_switch &&
+ (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
+ goto err;
+
+ /* Set the switch transaction in the previous chunk, if necessary. */
+ if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE)
+ chunk->switch_txn = __wt_txn_new_id(session);
+
+ /* Update the throttle time. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 0);
+
+ new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+
+ WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
+ nchunks + 1, &lsm_tree->chunk));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, "
+ "merge throttle %ld", lsm_tree->name,
+ new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));
+
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = new_id;
+ chunk->switch_txn = WT_TXN_NONE;
+ lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+ F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
+ ++lsm_tree->dsk_gen;
+
+ lsm_tree->modified = 1;
+
+err: WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ /*
+ * Errors that happen during a tree switch leave the tree in a state
+ * where we can't make progress. Error out of WiredTiger.
+ */
+ if (ret != 0)
+ WT_PANIC_RET(session, ret, "Failed doing LSM switch");
+ else if (!first_switch)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_drop --
+ * Drop an LSM tree.
+ */
+int
+__wt_lsm_tree_drop(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int locked;
+
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Drop the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ /* Drop any chunks on the obsolete list. */
+ for (i = 0; i < lsm_tree->nold_chunks; i++) {
+ if ((chunk = lsm_tree->old_chunks[i]) == NULL)
+ continue;
+ WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(
+ __wt_schema_drop(session, chunk->bloom_uri, cfg));
+ }
+
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ ret = __wt_metadata_remove(session, name);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_rename --
+ * Rename an LSM tree.
+ */
+int
+__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
+ const char *olduri, const char *newuri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ const char *old;
+ u_int i;
+ int locked;
+
+ old = NULL;
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, olduri, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Set the new name. */
+ WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));
+
+ /* Rename the chunks. */
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ old = chunk->uri;
+ chunk->uri = NULL;
+
+ WT_ERR(__wt_lsm_tree_chunk_name(
+ session, lsm_tree, chunk->id, &chunk->uri));
+ WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
+ __wt_free(session, old);
+
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ old = chunk->bloom_uri;
+ chunk->bloom_uri = NULL;
+ WT_ERR(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ WT_ERR(__wt_schema_rename(
+			    session, old, chunk->bloom_uri, cfg));
+ __wt_free(session, old);
+ }
+ }
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ WT_ERR(__wt_metadata_remove(session, olduri));
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (old != NULL)
+ __wt_free(session, old);
+ /*
+ * Discard this LSM tree structure. The first operation on the renamed
+ * tree will create a new one.
+ */
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_truncate --
+ * Truncate an LSM tree.
+ */
+int
+__wt_lsm_tree_truncate(
+ WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ int locked;
+
+ WT_UNUSED(cfg);
+ chunk = NULL;
+ locked = 0;
+
+ /* Get the LSM tree. */
+ WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));
+
+ /* Shut down the LSM worker. */
+ WT_ERR(__lsm_tree_close(session, lsm_tree));
+
+ /* Prevent any new opens. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Create the new chunk. */
+ WT_ERR(__wt_calloc_def(session, 1, &chunk));
+ chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
+
+ /* Mark all chunks old. */
+ WT_ERR(__wt_lsm_merge_update_tree(
+ session, lsm_tree, 0, lsm_tree->nchunks, chunk));
+
+ WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
+
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+
+err: if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ if (ret != 0) {
+ if (chunk != NULL) {
+ (void)__wt_schema_drop(session, chunk->uri, NULL);
+ __wt_free(session, chunk);
+ }
+ /*
+ * Discard the LSM tree structure on error. This will force the
+ * LSM tree to be re-opened the next time it is accessed and
+ * the last good version of the metadata will be used, resulting
+ * in a valid (not truncated) tree.
+ */
+ WT_TRET(__lsm_tree_discard(session, lsm_tree));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_lsm_tree_readlock --
+ * Acquire a shared lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_RET(__wt_readlock(session, lsm_tree->rwlock));
+
+ /*
+ * Diagnostic: avoid deadlocks with the schema lock: if we need it for
+ * an operation, we should already have it.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_readunlock --
+ * Release a shared lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+
+ F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+
+ if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
+ WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_writelock --
+ * Acquire an exclusive lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_RET(__wt_writelock(session, lsm_tree->rwlock));
+
+ /*
+ * Diagnostic: avoid deadlocks with the schema lock: if we need it for
+ * an operation, we should already have it.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ return (0);
+}
+
+/*
+ * __wt_lsm_tree_writeunlock --
+ * Release an exclusive lock on an LSM tree.
+ */
+int
+__wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+
+ F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+
+ if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0)
+ WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
+ return (0);
+}
+
+/*
+ * __wt_lsm_compact --
+ * Compact an LSM tree called via __wt_schema_worker.
+ */
+int
+__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ time_t begin, end;
+ uint64_t progress;
+ int i, compacting, flushing, locked, ref;
+
+ compacting = flushing = locked = ref = 0;
+ chunk = NULL;
+ /*
+ * This function is applied to all matching sources: ignore anything
+ * that is not an LSM tree.
+ */
+ if (!WT_PREFIX_MATCH(name, "lsm:"))
+ return (0);
+
+ /* Tell __wt_schema_worker not to look inside the LSM tree. */
+ *skip = 1;
+
+ WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));
+
+ if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
+ WT_ERR_MSG(session, EINVAL,
+ "LSM compaction requires active merge threads");
+
+ WT_ERR(__wt_seconds(session, &begin));
+
+ /*
+ * Compacting has two distinct phases.
+	 * 1. All in-memory chunks up to and including the current
+	 *    chunk must be flushed. Normally, the flush code
+ * does not flush the last, in-use chunk, so we set a force
+ * flag to include that last chunk. We monitor the state of the
+ * last chunk and periodically push another forced flush work
+ * unit until it is complete.
+ * 2. After all flushing is done, we move onto the merging
+ * phase for compaction. Again, we monitor the state and
+ * continue to push merge work units until all merging is done.
+ */
+
+ /* Lock the tree: single-thread compaction. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ locked = 1;
+
+ /* Clear any merge throttle: compact throws out that calculation. */
+ lsm_tree->merge_throttle = 0;
+ lsm_tree->merge_aggressiveness = 0;
+ progress = lsm_tree->merge_progressing;
+
+ /* If another thread started a compact on this tree, we're done. */
+ if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
+ goto err;
+
+ /*
+ * Set the switch transaction on the current chunk, if it
+ * hasn't been set before. This prevents further writes, so it
+ * can be flushed by the checkpoint worker.
+ */
+ if (lsm_tree->nchunks > 0 &&
+ (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
+ if (chunk->switch_txn == WT_TXN_NONE)
+ chunk->switch_txn = __wt_txn_new_id(session);
+ /*
+ * If we have a chunk, we want to look for it to be on-disk.
+ * So we need to add a reference to keep it available.
+ */
+ (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+ ref = 1;
+ }
+
+ locked = 0;
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (chunk != NULL) {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Compact force flush %s flags 0x%" PRIx32
+ " chunk %u flags 0x%"
+ PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags));
+ flushing = 1;
+ /*
+		 * Make sure the in-memory chunk gets flushed; do not push a
+ * switch, because we don't want to create a new in-memory
+ * chunk if the tree is being used read-only now.
+ */
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
+ } else {
+ /*
+ * If there is no chunk to flush, go straight to the
+ * compacting state.
+ */
+ compacting = 1;
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "COMPACT: Start compacting %s", lsm_tree->name));
+ }
+
+ /* Wait for the work unit queues to drain. */
+ while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
+ /*
+ * The flush flag is cleared when the chunk has been flushed.
+ * Continue to push forced flushes until the chunk is on disk.
+ * Once it is on disk move to the compacting phase.
+ */
+ if (flushing) {
+ WT_ASSERT(session, chunk != NULL);
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ WT_ERR(__wt_verbose(session,
+ WT_VERB_LSM,
+ "Compact flush done %s chunk %u. "
+ "Start compacting progress %" PRIu64,
+ name, chunk->id,
+ lsm_tree->merge_progressing));
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ flushing = ref = 0;
+ compacting = 1;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ progress = lsm_tree->merge_progressing;
+ } else {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Compact flush retry %s chunk %u",
+ name, chunk->id));
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
+ lsm_tree));
+ }
+ }
+
+		/*
+		 * The compacting flag is cleared when no merges can be done.
+		 * Ensure that we push through some aggressive merges before
+		 * stopping; otherwise we might not do merges that would
+		 * span chunks with different generations.
+		 */
+ if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
+ if (lsm_tree->merge_aggressiveness < 10 ||
+ (progress < lsm_tree->merge_progressing) ||
+ lsm_tree->merge_syncing) {
+ progress = lsm_tree->merge_progressing;
+ F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 10;
+ } else
+ break;
+ }
+ __wt_sleep(1, 0);
+ WT_ERR(__wt_seconds(session, &end));
+ if (session->compact->max_time > 0 &&
+ session->compact->max_time < (uint64_t)(end - begin)) {
+ WT_ERR(ETIMEDOUT);
+ }
+ /*
+ * Push merge operations while they are still getting work
+ * done. If we are pushing merges, make sure they are
+ * aggressive, to avoid duplicating effort.
+ */
+ if (compacting)
+#define COMPACT_PARALLEL_MERGES 5
+ for (i = lsm_tree->queue_ref;
+ i < COMPACT_PARALLEL_MERGES; i++) {
+ lsm_tree->merge_aggressiveness = 10;
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ }
+ }
+err:
+ /* Ensure anything we set is cleared. */
+ if (ref)
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ if (compacting) {
+ F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
+ lsm_tree->merge_aggressiveness = 0;
+ }
+ if (locked)
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ WT_TRET(__wt_verbose(session, WT_VERB_LSM,
+ "Compact %s complete, return %d", name, ret));
+
+ __wt_lsm_tree_release(session, lsm_tree);
+	return (ret);
+}
+
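+/*
+ * An illustrative sketch, not part of this change: __wt_lsm_compact is
+ * normally reached through __wt_schema_worker when an application compacts
+ * an LSM URI via the public API, and the "timeout" configuration feeds the
+ * session->compact->max_time check in the wait loop above. For example:
+ *
+ *	WT_SESSION *wt_session;
+ *	...
+ *	ret = wt_session->compact(wt_session, "lsm:example", "timeout=120");
+ */
+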
+/*
+ * __wt_lsm_tree_worker --
+ *	Run a schema worker operation on each level of an LSM tree.
+ */
+int
+__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*file_func)(WT_SESSION_IMPL *, const char *[]),
+ int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
+ const char *cfg[], uint32_t open_flags)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_TREE *lsm_tree;
+ u_int i;
+ int exclusive, locked;
+
+ locked = 0;
+ exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
+ WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));
+
+	/*
+	 * Mark that we're busy using the tree to coordinate with
+	 * merges, so that merging doesn't change the chunk array out
+	 * from underneath us.
+	 */
+ WT_ERR(exclusive ?
+ __wt_lsm_tree_writelock(session, lsm_tree) :
+ __wt_lsm_tree_readlock(session, lsm_tree));
+ locked = 1;
+ for (i = 0; i < lsm_tree->nchunks; i++) {
+ chunk = lsm_tree->chunk[i];
+ if (file_func == __wt_checkpoint &&
+ F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ continue;
+ WT_ERR(__wt_schema_worker(session, chunk->uri,
+ file_func, name_func, cfg, open_flags));
+ if (name_func == __wt_backup_list_uri_append &&
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
+ WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
+ file_func, name_func, cfg, open_flags));
+ }
+err: if (locked)
+ WT_TRET(exclusive ?
+ __wt_lsm_tree_writeunlock(session, lsm_tree) :
+ __wt_lsm_tree_readunlock(session, lsm_tree));
+ __wt_lsm_tree_release(session, lsm_tree);
+ return (ret);
+}
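+
+/*
+ * Usage sketch (illustrative only): the schema layer fans per-chunk
+ * operations out through this helper; for example, checkpointing every
+ * file backing a tree might look like:
+ *
+ *	ret = __wt_lsm_tree_worker(
+ *	    session, "lsm:example", __wt_checkpoint, NULL, cfg, 0);
+ */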
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
new file mode 100644
index 00000000000..278c400070f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -0,0 +1,625 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_bloom_create(
+ WT_SESSION_IMPL *, WT_LSM_TREE *, WT_LSM_CHUNK *, u_int);
+static int __lsm_discard_handle(WT_SESSION_IMPL *, const char *, const char *);
+
+/*
+ * __lsm_copy_chunks --
+ * Take a copy of part of the LSM tree chunk array so that we can work on
+ * the contents without holding the LSM tree handle lock long term.
+ */
+static int
+__lsm_copy_chunks(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, int old_chunks)
+{
+ WT_DECL_RET;
+ u_int i, nchunks;
+ size_t alloc;
+
+ /* Always return zero chunks on error. */
+ cookie->nchunks = 0;
+
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ return (__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /* Take a copy of the current state of the LSM tree. */
+ nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
+ alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;
+
+ /*
+ * If the tree array of active chunks is larger than our current buffer,
+ * increase the size of our current buffer to match.
+ */
+ if (cookie->chunk_alloc < alloc)
+ WT_ERR(__wt_realloc(session,
+ &cookie->chunk_alloc, alloc, &cookie->chunk_array));
+ if (nchunks > 0)
+ memcpy(cookie->chunk_array,
+ old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
+ nchunks * sizeof(*cookie->chunk_array));
+
+ /*
+ * Mark each chunk as active, so we don't drop it until after we know
+ * it's safe.
+ */
+ for (i = 0; i < nchunks; i++)
+ (void)WT_ATOMIC_ADD4(cookie->chunk_array[i]->refcnt, 1);
+
+err: WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ if (ret == 0)
+ cookie->nchunks = nchunks;
+ return (ret);
+}
+
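+/*
+ * A minimal usage sketch, mirroring the callers later in this file: a
+ * chunk-array copy is paired with __lsm_unpin_chunks and freeing the
+ * array, including on error paths:
+ *
+ *	WT_CLEAR(cookie);
+ *	WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));
+ *	...operate on cookie.chunk_array[0 .. cookie.nchunks - 1]...
+ * err:	__lsm_unpin_chunks(session, &cookie);
+ *	__wt_free(session, cookie.chunk_array);
+ */
+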
+/*
+ * __wt_lsm_get_chunk_to_flush --
+ * Find and pin a chunk in the LSM tree that is likely to need flushing.
+ */
+int
+__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp)
+{
+ u_int i, end;
+
+ *chunkp = NULL;
+
+ WT_ASSERT(session, lsm_tree->queue_ref > 0);
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
+ return (__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /*
+ * Normally we don't want to force out the last chunk. But if we're
+ * doing a forced flush, likely from a compact call, then we want
+ * to include the final chunk.
+ */
+ end = force ? lsm_tree->nchunks : lsm_tree->nchunks - 1;
+ for (i = 0; i < end; i++) {
+ if (!F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK)) {
+ (void)WT_ATOMIC_ADD4(lsm_tree->chunk[i]->refcnt, 1);
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "Flush%s: return chunk %u of %u: %s",
+ force ? " w/ force" : "", i, end - 1,
+ lsm_tree->chunk[i]->uri));
+ *chunkp = lsm_tree->chunk[i];
+ break;
+ }
+ }
+
+ WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ return (0);
+}
+
+/*
+ * __lsm_unpin_chunks --
+ *	Decrement the reference count for a set of chunks, allowing those
+ *	chunks to be considered for deletion.
+ */
+static void
+__lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie)
+{
+ u_int i;
+
+ for (i = 0; i < cookie->nchunks; i++) {
+ if (cookie->chunk_array[i] == NULL)
+ continue;
+ WT_ASSERT(session, cookie->chunk_array[i]->refcnt > 0);
+ (void)WT_ATOMIC_SUB4(cookie->chunk_array[i]->refcnt, 1);
+ }
+ /* Ensure subsequent calls don't double decrement. */
+ cookie->nchunks = 0;
+}
+
+/*
+ * __wt_lsm_work_switch --
+ * Do a switch if the LSM tree needs one.
+ */
+int
+__wt_lsm_work_switch(
+ WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran)
+{
+ WT_DECL_RET;
+ WT_LSM_WORK_UNIT *entry;
+
+ /* We've become responsible for freeing the work unit. */
+ entry = *entryp;
+ *ran = 0;
+ *entryp = NULL;
+
+ if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_lsm_tree_switch(session, entry->lsm_tree));
+ /* Failing to complete the switch is fine */
+ if (ret == EBUSY) {
+ if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH))
+ WT_ERR(__wt_lsm_manager_push_entry(session,
+ WT_LSM_WORK_SWITCH, 0, entry->lsm_tree));
+ ret = 0;
+ } else
+ *ran = 1;
+ }
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_work_bloom --
+ * Try to create a Bloom filter for the newest on-disk chunk that doesn't
+ * have one.
+ */
+int
+__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORKER_COOKIE cookie;
+ u_int i, merge;
+
+ WT_CLEAR(cookie);
+
+ WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));
+
+ /* Create bloom filters in all checkpointed chunks. */
+ merge = 0;
+ for (i = 0; i < cookie.nchunks; i++) {
+ chunk = cookie.chunk_array[i];
+
+ /*
+ * Skip if a thread is still active in the chunk or it
+ * isn't suitable.
+ */
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
+ F_ISSET(chunk, WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) ||
+ chunk->generation > 0 ||
+ chunk->count == 0)
+ continue;
+
+ /*
+ * See if we win the race to switch on the "busy" flag and
+ * recheck that the chunk still needs a Bloom filter.
+ */
+ if (WT_ATOMIC_CAS4(chunk->bloom_busy, 0, 1)) {
+ if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ ret = __lsm_bloom_create(
+ session, lsm_tree, chunk, (u_int)i);
+ /*
+ * Record if we were successful so that we can
+ * later push a merge work unit.
+ */
+ if (ret == 0)
+ merge = 1;
+ }
+ chunk->bloom_busy = 0;
+ break;
+ }
+ }
+ /*
+ * If we created any bloom filters, we push a merge work unit now.
+ */
+ if (merge)
+ WT_ERR(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+
+err:
+ __lsm_unpin_chunks(session, &cookie);
+ __wt_free(session, cookie.chunk_array);
+ return (ret);
+}
+
+/*
+ * __wt_lsm_checkpoint_chunk --
+ * Flush a single LSM chunk to disk.
+ */
+int
+__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
+{
+ WT_DECL_RET;
+ WT_TXN_ISOLATION saved_isolation;
+
+ /*
+ * If the chunk is already checkpointed, make sure it is also evicted.
+ * Either way, there is no point trying to checkpoint it again.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
+ !chunk->evicted) {
+ if ((ret = __lsm_discard_handle(
+ session, chunk->uri, NULL)) == 0)
+ chunk->evicted = 1;
+ else if (ret == EBUSY)
+ ret = 0;
+ else
+ WT_RET_MSG(session, ret, "discard handle");
+ }
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker %s already on disk",
+ chunk->uri));
+ return (0);
+ }
+
+ /* Stop if a running transaction needs the chunk. */
+ __wt_txn_update_oldest(session);
+ if (chunk->switch_txn == WT_TXN_NONE ||
+ !__wt_txn_visible_all(session, chunk->switch_txn)) {
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker %s: running transaction, return",
+ chunk->uri));
+ return (0);
+ }
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
+ chunk->uri));
+
+ /*
+ * Flush the file before checkpointing: this is the expensive part in
+ * terms of I/O.
+ *
+ * Use the special eviction isolation level to avoid interfering with
+ * an application checkpoint: we have already checked that all of the
+ * updates in this chunk are globally visible.
+ *
+ * !!! We can wait here for checkpoints and fsyncs to complete, which
+ * can be a long time.
+ */
+ if ((ret = __wt_session_get_btree(
+ session, chunk->uri, NULL, NULL, 0)) == 0) {
+ saved_isolation = session->txn.isolation;
+ session->txn.isolation = TXN_ISO_EVICTION;
+ ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
+ session->txn.isolation = saved_isolation;
+ WT_TRET(__wt_session_release_btree(session));
+ }
+ WT_RET(ret);
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
+ chunk->uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, chunk->uri,
+ __wt_checkpoint, NULL, NULL, 0));
+
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "LSM checkpoint");
+
+ /* Now the file is written, get the chunk size. */
+ WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));
+
+ /* Update the flush timestamp to help track ongoing progress. */
+ WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));
+
+ /* Lock the tree, mark the chunk as on disk and update the metadata. */
+ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
+ ret = __wt_lsm_meta_write(session, lsm_tree);
+ ++lsm_tree->dsk_gen;
+
+ /* Update the throttle time. */
+ __wt_lsm_tree_throttle(session, lsm_tree, 1);
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "LSM metadata write");
+
+ /*
+ * Clear the "cache resident" flag so the primary can be evicted and
+ * eventually closed. Only do this once the checkpoint has succeeded:
+ * otherwise, accessing the leaf page during the checkpoint can trigger
+ * forced eviction.
+ */
+ WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
+ __wt_btree_evictable(session, 1);
+ WT_RET(__wt_session_release_btree(session));
+
+ /* Make sure we aren't pinning a transaction ID. */
+ __wt_txn_release_snapshot(session);
+
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
+ chunk->uri));
+	/* Schedule a bloom filter create for our newly flushed chunk. */
+ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
+ else
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_MERGE, 0, lsm_tree));
+ return (0);
+}
+
+/*
+ * __lsm_bloom_create --
+ * Create a bloom filter for a chunk of the LSM tree that has been
+ * checkpointed but not yet been merged.
+ */
+static int
+__lsm_bloom_create(WT_SESSION_IMPL *session,
+ WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
+{
+ WT_BLOOM *bloom;
+ WT_CURSOR *src;
+ WT_DECL_RET;
+ WT_ITEM key;
+ WT_SESSION *wt_session;
+ uint64_t insert_count;
+ int exist;
+
+ /*
+ * Normally, the Bloom URI is populated when the chunk struct is
+ * allocated. After an open, however, it may not have been.
+ * Deal with that here.
+ */
+ if (chunk->bloom_uri == NULL)
+ WT_RET(__wt_lsm_tree_bloom_name(
+ session, lsm_tree, chunk->id, &chunk->bloom_uri));
+
+ /*
+ * Drop the bloom filter first - there may be some content hanging over
+ * from an aborted merge or checkpoint.
+ */
+ wt_session = &session->iface;
+ WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
+ if (exist)
+ WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));
+
+ bloom = NULL;
+ /*
+ * This is merge-like activity, and we don't want compacts to give up
+ * because we are creating a bunch of bloom filters before merging.
+ */
+ ++lsm_tree->merge_progressing;
+ WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
+ lsm_tree->bloom_config, chunk->count,
+ lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));
+
+ /* Open a special merge cursor just on this chunk. */
+ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
+ F_SET(src, WT_CURSTD_RAW);
+ WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));
+
+ F_SET(session, WT_SESSION_NO_CACHE);
+ for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
+ WT_ERR(src->get_key(src, &key));
+ WT_ERR(__wt_bloom_insert(bloom, &key));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ WT_TRET(src->close(src));
+
+ WT_TRET(__wt_bloom_finalize(bloom));
+ WT_ERR(ret);
+
+ F_CLR(session, WT_SESSION_NO_CACHE);
+
+ /*
+ * Load the new Bloom filter into cache.
+ *
+ * We're doing advisory reads to fault the new trees into cache.
+ * Don't block if the cache is full: our next unit of work may be to
+ * discard some trees to free space.
+ */
+ F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+
+ WT_CLEAR(key);
+ WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
+
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker created bloom filter %s. "
+ "Expected %" PRIu64 " items, got %" PRIu64,
+ chunk->bloom_uri, chunk->count, insert_count));
+
+ /* Ensure the bloom filter is in the metadata. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+ F_SET(chunk, WT_LSM_CHUNK_BLOOM);
+ ret = __wt_lsm_meta_write(session, lsm_tree);
+ ++lsm_tree->dsk_gen;
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");
+
+err: if (bloom != NULL)
+ WT_TRET(__wt_bloom_close(bloom));
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ return (ret);
+}
+
+/*
+ * __lsm_discard_handle --
+ * Try to discard a handle from cache.
+ */
+static int
+__lsm_discard_handle(
+ WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
+{
+ /* This will fail with EBUSY if the file is still in use. */
+ WT_RET(__wt_session_get_btree(session, uri, checkpoint, NULL,
+ WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+
+ F_SET(session->dhandle, WT_DHANDLE_DISCARD);
+ return (__wt_session_release_btree(session));
+}
+
+/*
+ * __lsm_drop_file --
+ * Helper function to drop part of an LSM tree.
+ */
+static int
+__lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_DECL_RET;
+ const char *drop_cfg[] = {
+ WT_CONFIG_BASE(session, session_drop), "remove_files=false", NULL
+ };
+
+ /*
+ * We need to grab the schema lock to drop the file, so first try to
+ * make sure there is minimal work to freeing space in the cache. Only
+ * bother trying to discard the checkpoint handle: the in-memory handle
+ * should have been closed already.
+ *
+ * This will fail with EBUSY if the file is still in use.
+ */
+ WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT));
+
+ /*
+	 * Take the schema lock for the drop operation: __wt_schema_drop
+	 * results in the hot backup lock being taken when it updates the
+	 * metadata (which would be too late to prevent our drop).
+ */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_drop(session, uri, drop_cfg));
+
+ if (ret == 0)
+ ret = __wt_remove(session, uri + strlen("file:"));
+ WT_RET(__wt_verbose(session, WT_VERB_LSM, "Dropped %s", uri));
+
+ if (ret == EBUSY || ret == ENOENT)
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "LSM worker drop of %s failed with %d", uri, ret));
+
+ return (ret);
+}
+
+/*
+ * __wt_lsm_free_chunks --
+ * Try to drop chunks from the tree that are no longer required.
+ */
+int
+__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORKER_COOKIE cookie;
+ u_int i, skipped;
+ int flush_metadata, drop_ret;
+
+ flush_metadata = 0;
+
+ if (lsm_tree->nold_chunks == 0)
+ return (0);
+
+ /*
+ * Make sure only a single thread is freeing the old chunk array
+ * at any time.
+ */
+ if (!WT_ATOMIC_CAS4(lsm_tree->freeing_old_chunks, 0, 1))
+ return (0);
+ /*
+ * Take a copy of the current state of the LSM tree and look for chunks
+ * to drop. We do it this way to avoid holding the LSM tree lock while
+ * doing I/O or waiting on the schema lock.
+ *
+ * This is safe because only one thread will be in this function at a
+ * time. Merges may complete concurrently, and the old_chunks array
+ * may be extended, but we shuffle down the pointers each time we free
+ * one to keep the non-NULL slots at the beginning of the array.
+ */
+ WT_CLEAR(cookie);
+ WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
+ for (i = skipped = 0; i < cookie.nchunks; i++) {
+ chunk = cookie.chunk_array[i];
+ WT_ASSERT(session, chunk != NULL);
+ /* Skip the chunk if another worker is using it. */
+ if (chunk->refcnt > 1) {
+ ++skipped;
+ continue;
+ }
+
+ /*
+ * Don't remove files if a hot backup is in progress.
+ *
+ * The schema lock protects the set of live files, this check
+ * prevents us from removing a file that hot backup already
+ * knows about.
+ */
+ if (S2C(session)->hot_backup != 0)
+ break;
+
+		/*
+		 * Drop any bloom filters and chunks we can. Don't try to
+		 * drop a chunk if the bloom filter drop fails.
+		 * An EBUSY return indicates that a cursor is still open in
+		 * the tree - move to the next chunk in that case.
+		 * An ENOENT return indicates that the LSM tree metadata was
+		 * out of sync with the on-disk state; update the metadata
+		 * to match in that case.
+		 */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
+ drop_ret = __lsm_drop_file(session, chunk->bloom_uri);
+ if (drop_ret == EBUSY) {
+ ++skipped;
+ continue;
+ } else if (drop_ret != ENOENT)
+ WT_ERR(drop_ret);
+
+ flush_metadata = 1;
+ F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
+ }
+ if (chunk->uri != NULL) {
+ drop_ret = __lsm_drop_file(session, chunk->uri);
+ if (drop_ret == EBUSY) {
+ ++skipped;
+ continue;
+ } else if (drop_ret != ENOENT)
+ WT_ERR(drop_ret);
+ flush_metadata = 1;
+ }
+
+ /* Lock the tree to clear out the old chunk information. */
+ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
+
+ /*
+ * The chunk we are looking at should be the first one in the
+ * tree that we haven't already skipped over.
+ */
+ WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
+ __wt_free(session, chunk->bloom_uri);
+ __wt_free(session, chunk->uri);
+ __wt_free(session, lsm_tree->old_chunks[skipped]);
+
+ /* Shuffle down to keep all occupied slots at the beginning. */
+ if (--lsm_tree->nold_chunks > skipped) {
+ memmove(lsm_tree->old_chunks + skipped,
+ lsm_tree->old_chunks + skipped + 1,
+ (lsm_tree->nold_chunks - skipped) *
+ sizeof(WT_LSM_CHUNK *));
+ lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
+ }
+
+ WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
+
+ /*
+ * Clear the chunk in the cookie so we don't attempt to
+ * decrement the reference count.
+ */
+ cookie.chunk_array[i] = NULL;
+ }
+
+err:	/* Flush the metadata unless the system is in a panic state. */
+ if (flush_metadata && ret != WT_PANIC) {
+ WT_TRET(__wt_lsm_tree_writelock(session, lsm_tree));
+ WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
+ WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
+ }
+ __lsm_unpin_chunks(session, &cookie);
+ __wt_free(session, cookie.chunk_array);
+ lsm_tree->freeing_old_chunks = 0;
+
+ /* Returning non-zero means there is no work to do. */
+ if (!flush_metadata)
+ WT_TRET(WT_NOTFOUND);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_worker.c b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
new file mode 100644
index 00000000000..f24e58148b1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_worker.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __lsm_worker_general_op(
+ WT_SESSION_IMPL *, WT_LSM_WORKER_ARGS *, int *);
+static void * __lsm_worker(void *);
+
+/*
+ * __wt_lsm_worker_start --
+ * A wrapper around the LSM worker thread start.
+ */
+int
+__wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args)
+{
+ WT_RET(__wt_verbose(session, WT_VERB_LSM,
+ "Start LSM worker %d type 0x%x", args->id, args->type));
+ return (__wt_thread_create(session, &args->tid, __lsm_worker, args));
+}
+
+/*
+ * __lsm_worker_general_op --
+ * Execute a single bloom, drop or flush work unit.
+ */
+static int
+__lsm_worker_general_op(
+ WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *cookie, int *completed)
+{
+ WT_DECL_RET;
+ WT_LSM_CHUNK *chunk;
+ WT_LSM_WORK_UNIT *entry;
+ int force;
+
+ *completed = 0;
+ /*
+ * Return if this thread cannot process a bloom, drop or flush.
+ */
+ if (!FLD_ISSET(cookie->type,
+ WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH))
+ return (WT_NOTFOUND);
+
+ if ((ret = __wt_lsm_manager_pop_entry(session,
+ cookie->type, &entry)) != 0 || entry == NULL)
+ return (ret);
+
+ if (entry->type == WT_LSM_WORK_FLUSH) {
+ force = F_ISSET(entry, WT_LSM_WORK_FORCE);
+ F_CLR(entry, WT_LSM_WORK_FORCE);
+ WT_ERR(__wt_lsm_get_chunk_to_flush(session,
+ entry->lsm_tree, force, &chunk));
+ /*
+ * If we got a chunk to flush, checkpoint it.
+ */
+ if (chunk != NULL) {
+ WT_ERR(__wt_verbose(session, WT_VERB_LSM,
+ "Flush%s chunk %d %s",
+ force ? " w/ force" : "",
+ chunk->id, chunk->uri));
+ ret = __wt_lsm_checkpoint_chunk(
+ session, entry->lsm_tree, chunk);
+ WT_ASSERT(session, chunk->refcnt > 0);
+ (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ WT_ERR(ret);
+ }
+ } else if (entry->type == WT_LSM_WORK_DROP)
+ WT_ERR(__wt_lsm_free_chunks(session, entry->lsm_tree));
+ else if (entry->type == WT_LSM_WORK_BLOOM)
+ WT_ERR(__wt_lsm_work_bloom(session, entry->lsm_tree));
+ *completed = 1;
+
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ return (ret);
+}
+
+/*
+ * __lsm_worker --
+ * A thread that executes work units for all open LSM trees.
+ */
+static void *
+__lsm_worker(void *arg)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LSM_WORK_UNIT *entry;
+ WT_LSM_WORKER_ARGS *cookie;
+ WT_SESSION_IMPL *session;
+ int progress, ran;
+
+ cookie = (WT_LSM_WORKER_ARGS *)arg;
+ session = cookie->session;
+ conn = S2C(session);
+
+ entry = NULL;
+ while (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
+ F_ISSET(cookie, WT_LSM_WORKER_RUN)) {
+ progress = 0;
+
+		/*
+		 * Workers process the different LSM work queues. Some
+		 * workers can handle several or all work unit types, so the
+		 * code is ordered to make the most important operations
+		 * happen first. Switches are the highest priority.
+		 */
+ while (FLD_ISSET(cookie->type, WT_LSM_WORK_SWITCH) &&
+ (ret = __wt_lsm_manager_pop_entry(
+ session, WT_LSM_WORK_SWITCH, &entry)) == 0 &&
+ entry != NULL)
+ WT_ERR(
+ __wt_lsm_work_switch(session, &entry, &progress));
+ /* Flag an error if the pop failed. */
+ WT_ERR(ret);
+
+ /*
+ * Next the general operations.
+ */
+ ret = __lsm_worker_general_op(session, cookie, &ran);
+ if (ret == EBUSY || ret == WT_NOTFOUND)
+ ret = 0;
+ WT_ERR(ret);
+ progress = progress || ran;
+
+ /*
+ * Finally see if there is any merge work we can do. This is
+ * last because the earlier operations may result in adding
+ * merge work to the queue.
+ */
+ if (FLD_ISSET(cookie->type, WT_LSM_WORK_MERGE) &&
+ (ret = __wt_lsm_manager_pop_entry(
+ session, WT_LSM_WORK_MERGE, &entry)) == 0 &&
+ entry != NULL) {
+ WT_ASSERT(session, entry->type == WT_LSM_WORK_MERGE);
+ ret = __wt_lsm_merge(session,
+ entry->lsm_tree, cookie->id);
+ if (ret == WT_NOTFOUND) {
+ F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING);
+ ret = 0;
+ } else if (ret == EBUSY)
+ ret = 0;
+ /* Clear any state */
+ WT_CLEAR_BTREE_IN_SESSION(session);
+ __wt_lsm_manager_free_work_unit(session, entry);
+ entry = NULL;
+ progress = 1;
+ }
+ /* Flag an error if the pop failed. */
+ WT_ERR(ret);
+
+ /* Don't busy wait if there was any work to do. */
+ if (!progress) {
+ WT_ERR(
+ __wt_cond_wait(session, cookie->work_cond, 10000));
+ continue;
+ }
+ }
+
+ if (ret != 0) {
+err: __wt_lsm_manager_free_work_unit(session, entry);
+ __wt_err(session, ret,
+ "Error in LSM worker thread %d", cookie->id);
+ }
+ return (NULL);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c
new file mode 100644
index 00000000000..313516148c0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_apply.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_meta_btree_apply --
+ * Apply a function to all files listed in the metadata, apart from the
+ * metadata file.
+ */
+int
+__wt_meta_btree_apply(WT_SESSION_IMPL *session,
+ int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
+{
+ WT_CURSOR *cursor;
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *uri;
+ int cmp, tret;
+
+ saved_dhandle = session->dhandle;
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, "file:");
+ if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+ tret = cursor->next(cursor);
+ for (; tret == 0; tret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ else if (strcmp(uri, WT_METAFILE_URI) == 0)
+ continue;
+
+		/*
+		 * We need to pull the handle into the session handle cache
+		 * and make sure it's referenced to stop other internal code
+		 * dropping the handle (e.g., in LSM when cleaning up
+		 * obsolete chunks). Holding the metadata lock isn't enough.
+		 */
+ ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
+ if (ret == 0) {
+ ret = func(session, cfg);
+ if (WT_META_TRACKING(session))
+ WT_TRET(
+ __wt_meta_track_handle_lock(session, 0));
+ else
+ WT_TRET(__wt_session_release_btree(session));
+ } else if (ret == EBUSY)
+ ret = __wt_conn_btree_apply_single(
+ session, uri, NULL, func, cfg);
+ WT_ERR(ret);
+ }
+
+ if (tret != WT_NOTFOUND)
+ WT_TRET(tret);
+err: WT_TRET(cursor->close(cursor));
+ session->dhandle = saved_dhandle;
+ return (ret);
+}
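+
+/*
+ * Usage sketch (illustrative only): callers run a per-file callback with
+ * the schema lock held, e.g.:
+ *
+ *	WT_WITH_SCHEMA_LOCK(session,
+ *	    ret = __wt_meta_btree_apply(session, __wt_checkpoint, cfg));
+ */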
diff --git a/src/third_party/wiredtiger/src/meta/meta_ckpt.c b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
new file mode 100644
index 00000000000..998ae7e0d02
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_ckpt.c
@@ -0,0 +1,528 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __ckpt_last(WT_SESSION_IMPL *, const char *, WT_CKPT *);
+static int __ckpt_last_name(WT_SESSION_IMPL *, const char *, const char **);
+static int __ckpt_load(WT_SESSION_IMPL *,
+ WT_CONFIG_ITEM *, WT_CONFIG_ITEM *, WT_CKPT *);
+static int __ckpt_named(
+ WT_SESSION_IMPL *, const char *, const char *, WT_CKPT *);
+static int __ckpt_set(WT_SESSION_IMPL *, const char *, const char *);
+static int __ckpt_version_chk(WT_SESSION_IMPL *, const char *, const char *);
+
+/*
+ * __wt_meta_checkpoint --
+ * Return a file's checkpoint information.
+ */
+int
+__wt_meta_checkpoint(WT_SESSION_IMPL *session,
+ const char *fname, const char *checkpoint, WT_CKPT *ckpt)
+{
+ WT_DECL_RET;
+ const char *config;
+
+ config = NULL;
+
+ /* Retrieve the metadata entry for the file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Check the major/minor version numbers. */
+ WT_ERR(__ckpt_version_chk(session, fname, config));
+
+	/*
+	 * Retrieve the named checkpoint or the last checkpoint.
+	 *
+	 * If we don't find a named checkpoint, we're done: named checkpoints
+	 * are read-only. If we don't find a default checkpoint, the file is
+	 * being created; return "no data" and let our caller handle it.
+	 */
+ if (checkpoint == NULL) {
+ if ((ret = __ckpt_last(session, config, ckpt)) == WT_NOTFOUND) {
+ ret = 0;
+ ckpt->addr.data = ckpt->raw.data = NULL;
+ ckpt->addr.size = ckpt->raw.size = 0;
+ }
+ } else
+ WT_ERR(__ckpt_named(session, checkpoint, config, ckpt));
+
+err: __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_meta_checkpoint_last_name --
+ * Return the last unnamed checkpoint's name.
+ */
+int
+__wt_meta_checkpoint_last_name(
+ WT_SESSION_IMPL *session, const char *fname, const char **namep)
+{
+ WT_DECL_RET;
+ const char *config;
+
+ config = NULL;
+
+ /* Retrieve the metadata entry for the file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Check the major/minor version numbers. */
+ WT_ERR(__ckpt_version_chk(session, fname, config));
+
+ /* Retrieve the name of the last unnamed checkpoint. */
+ WT_ERR(__ckpt_last_name(session, config, namep));
+
+err: __wt_free(session, config);
+ return (ret);
+}
+
+/*
+ * __wt_meta_checkpoint_clear --
+ * Clear a file's checkpoint.
+ */
+int
+__wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname)
+{
+ /*
+ * If we are unrolling a failed create, we may have already removed the
+ * metadata entry. If no entry is found to update and we're trying to
+ * clear the checkpoint, just ignore it.
+ */
+ WT_RET_NOTFOUND_OK(__ckpt_set(session, fname, NULL));
+
+ return (0);
+}
+
+/*
+ * __ckpt_set --
+ * Set a file's checkpoint.
+ */
+static int
+__ckpt_set(WT_SESSION_IMPL *session, const char *fname, const char *v)
+{
+ WT_DECL_RET;
+ const char *config, *cfg[3], *newcfg;
+
+ config = newcfg = NULL;
+
+ /* Retrieve the metadata for this file. */
+ WT_ERR(__wt_metadata_search(session, fname, &config));
+
+ /* Replace the checkpoint entry. */
+ cfg[0] = config;
+ cfg[1] = v == NULL ? "checkpoint=()" : v;
+ cfg[2] = NULL;
+ WT_ERR(__wt_config_collapse(session, cfg, &newcfg));
+ WT_ERR(__wt_metadata_update(session, fname, newcfg));
+
+err: __wt_free(session, config);
+ __wt_free(session, newcfg);
+ return (ret);
+}
+
+/*
+ * __ckpt_named --
+ * Return the information associated with a file's named checkpoint.
+ */
+static int
+__ckpt_named(WT_SESSION_IMPL *session,
+ const char *checkpoint, const char *config, WT_CKPT *ckpt)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM k, v;
+
+ WT_RET(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_RET(__wt_config_subinit(session, &ckptconf, &v));
+
+ /*
+ * Take the first match: there should never be more than a single
+ * checkpoint of any name.
+ */
+ while (__wt_config_next(&ckptconf, &k, &v) == 0)
+ if (WT_STRING_MATCH(checkpoint, k.str, k.len))
+ return (__ckpt_load(session, &k, &v, ckpt));
+
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __ckpt_last --
+ * Return the information associated with the file's last checkpoint.
+ */
+static int
+__ckpt_last(WT_SESSION_IMPL *session, const char *config, WT_CKPT *ckpt)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM a, k, v;
+ int64_t found;
+
+ WT_RET(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_RET(__wt_config_subinit(session, &ckptconf, &v));
+ for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
+ /* Ignore checkpoints before the ones we've already seen. */
+ WT_RET(__wt_config_subgets(session, &v, "order", &a));
+ if (found) {
+ if (a.val < found)
+ continue;
+ __wt_meta_checkpoint_free(session, ckpt);
+ }
+ found = a.val;
+ WT_RET(__ckpt_load(session, &k, &v, ckpt));
+ }
+
+ return (found ? 0 : WT_NOTFOUND);
+}
+
+/*
+ * __ckpt_last_name --
+ * Return the name associated with the file's last unnamed checkpoint.
+ */
+static int
+__ckpt_last_name(
+ WT_SESSION_IMPL *session, const char *config, const char **namep)
+{
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM a, k, v;
+ WT_DECL_RET;
+ int64_t found;
+
+ *namep = NULL;
+
+ WT_ERR(__wt_config_getones(session, config, "checkpoint", &v));
+ WT_ERR(__wt_config_subinit(session, &ckptconf, &v));
+ for (found = 0; __wt_config_next(&ckptconf, &k, &v) == 0;) {
+		/*
+		 * We only care about unnamed checkpoints; applications may
+		 * not use any matching prefix as a checkpoint name, so the
+		 * comparison is pretty simple.
+		 */
+ if (k.len < strlen(WT_CHECKPOINT) ||
+ strncmp(k.str, WT_CHECKPOINT, strlen(WT_CHECKPOINT)) != 0)
+ continue;
+
+ /* Ignore checkpoints before the ones we've already seen. */
+ WT_ERR(__wt_config_subgets(session, &v, "order", &a));
+ if (found && a.val < found)
+ continue;
+
+ if (*namep != NULL)
+ __wt_free(session, *namep);
+ WT_ERR(__wt_strndup(session, k.str, k.len, namep));
+ found = a.val;
+ }
+ if (!found)
+ ret = WT_NOTFOUND;
+
+ if (0) {
+err:	__wt_free(session, *namep);
+ }
+ return (ret);
+}
+
+/*
+ * __ckpt_compare_order --
+ * Qsort comparison routine for the checkpoint list.
+ */
+static int
+__ckpt_compare_order(const void *a, const void *b)
+{
+ WT_CKPT *ackpt, *bckpt;
+
+ ackpt = (WT_CKPT *)a;
+ bckpt = (WT_CKPT *)b;
+
+ return (ackpt->order > bckpt->order ? 1 : -1);
+}
+
+/*
+ * __wt_meta_ckptlist_get --
+ * Load all available checkpoint information for a file.
+ */
+int
+__wt_meta_ckptlist_get(
+ WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep)
+{
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG ckptconf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ size_t allocated, slot;
+ const char *config;
+
+ *ckptbasep = NULL;
+
+ ckptbase = NULL;
+ allocated = slot = 0;
+ config = NULL;
+
+ /* Retrieve the metadata information for the file. */
+ WT_RET(__wt_metadata_search(session, fname, &config));
+
+ /* Load any existing checkpoints into the array. */
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ if (__wt_config_getones(session, config, "checkpoint", &v) == 0 &&
+ __wt_config_subinit(session, &ckptconf, &v) == 0)
+ for (; __wt_config_next(&ckptconf, &k, &v) == 0; ++slot) {
+ WT_ERR(__wt_realloc_def(
+ session, &allocated, slot + 1, &ckptbase));
+ ckpt = &ckptbase[slot];
+
+ WT_ERR(__ckpt_load(session, &k, &v, ckpt));
+ }
+
+ /*
+ * Allocate an extra slot for a new value, plus a slot to mark the end.
+ *
+ * This isn't very clean, but there's necessary cooperation between the
+ * schema layer (that maintains the list of checkpoints), the btree
+ * layer (that knows when the root page is written, creating a new
+ * checkpoint), and the block manager (which actually creates the
+ * checkpoint). All of that cooperation is handled in the WT_CKPT
+ * structure referenced from the WT_BTREE structure.
+ */
+ WT_ERR(__wt_realloc_def(session, &allocated, slot + 2, &ckptbase));
+
+ /* Sort in creation-order. */
+ qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order);
+
+ /* Return the array to our caller. */
+ *ckptbasep = ckptbase;
+
+ if (0) {
+err: __wt_meta_ckptlist_free(session, ckptbase);
+ }
+ __wt_free(session, config);
+ __wt_scr_free(&buf);
+
+ return (ret);
+}
+
+/*
+ * __ckpt_load --
+ * Load a single checkpoint's information into a WT_CKPT structure.
+ */
+static int
+__ckpt_load(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_CKPT *ckpt)
+{
+ WT_CONFIG_ITEM a;
+ char timebuf[64];
+
+ /*
+ * Copy the name, address (raw and hex), order and time into the slot.
+ * If there's no address, it's a fake.
+ */
+ WT_RET(__wt_strndup(session, k->str, k->len, &ckpt->name));
+
+ WT_RET(__wt_config_subgets(session, v, "addr", &a));
+ WT_RET(__wt_buf_set(session, &ckpt->addr, a.str, a.len));
+ if (a.len == 0)
+ F_SET(ckpt, WT_CKPT_FAKE);
+ else
+ WT_RET(__wt_nhex_to_raw(session, a.str, a.len, &ckpt->raw));
+
+ WT_RET(__wt_config_subgets(session, v, "order", &a));
+ if (a.len == 0)
+ goto format;
+ ckpt->order = a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "time", &a));
+ if (a.len == 0 || a.len > sizeof(timebuf) - 1)
+ goto format;
+ memcpy(timebuf, a.str, a.len);
+ timebuf[a.len] = '\0';
+ if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1)
+ goto format;
+
+ WT_RET(__wt_config_subgets(session, v, "size", &a));
+ ckpt->ckpt_size = (uint64_t)a.val;
+
+ WT_RET(__wt_config_subgets(session, v, "write_gen", &a));
+ if (a.len == 0)
+ goto format;
+ /*
+ * The largest value a WT_CONFIG_ITEM can handle is signed: this value
+ * appears on disk and I don't want to sign it there, so I'm casting it
+ * here instead.
+ */
+ ckpt->write_gen = (uint64_t)a.val;
+
+ return (0);
+
+format:
+ WT_RET_MSG(session, WT_ERROR, "corrupted checkpoint list");
+}
+
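+/*
+ * For reference, a sketch of the metadata value parsed here (the format is
+ * taken from __wt_meta_ckptlist_set below; the values are invented):
+ *
+ *	checkpoint=(WiredTigerCheckpoint.3=(addr="0181e4...",order=3,
+ *	    time=1400000000,size=8192,write_gen=7))
+ *
+ * __ckpt_load fills a WT_CKPT from one name/value pair in that list.
+ */
+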
+/*
+ * __wt_meta_ckptlist_set --
+ * Set a file's checkpoint value from the WT_CKPT list.
+ */
+int
+__wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
+ const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn)
+{
+ WT_CKPT *ckpt;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ time_t secs;
+ int64_t maxorder;
+ const char *sep;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ maxorder = 0;
+ sep = "";
+ WT_ERR(__wt_buf_fmt(session, buf, "checkpoint=("));
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ /*
+ * Each internal checkpoint name is appended with a generation
+ * to make it a unique name. We're solving two problems: when
+ * two checkpoints are taken quickly, the timer may not be
+ * unique and/or we can even see time travel on the second
+ * checkpoint if we snapshot the time in-between nanoseconds
+ * rolling over. Second, if we reset the generational counter
+ * when new checkpoints arrive, we could logically re-create
+ * specific checkpoints, racing with cursors open on those
+ * checkpoints. I can't think of any way to return incorrect
+ * results by racing with those cursors, but it's simpler not
+ * to worry about it.
+ */
+ if (ckpt->order > maxorder)
+ maxorder = ckpt->order;
+
+ /* Skip deleted checkpoints. */
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ if (F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_UPDATE)) {
+ /*
+ * We fake checkpoints for handles in the middle of a
+ * bulk load. If there is a checkpoint, convert the
+ * raw cookie to a hex string.
+ */
+ if (ckpt->raw.size == 0)
+ ckpt->addr.size = 0;
+ else
+ WT_ERR(__wt_raw_to_hex(session,
+ ckpt->raw.data,
+ ckpt->raw.size, &ckpt->addr));
+
+ /* Set the order and timestamp. */
+ if (F_ISSET(ckpt, WT_CKPT_ADD))
+ ckpt->order = ++maxorder;
+
+			/*
+			 * XXX
+			 * Assumes a time_t fits into a uintmax_t, which
+			 * isn't guaranteed: a time_t has to be an arithmetic
+			 * type, but not necessarily an integral type.
+			 */
+ WT_ERR(__wt_seconds(session, &secs));
+ ckpt->sec = (uintmax_t)secs;
+ }
+ if (strcmp(ckpt->name, WT_CHECKPOINT) == 0)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRIu64
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
+ sep, ckpt->name, ckpt->order,
+ (int)ckpt->addr.size, (char *)ckpt->addr.data,
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
+ else
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ "%s%s=(addr=\"%.*s\",order=%" PRIu64
+ ",time=%" PRIuMAX ",size=%" PRIu64
+ ",write_gen=%" PRIu64 ")",
+ sep, ckpt->name,
+ (int)ckpt->addr.size, (char *)ckpt->addr.data,
+ ckpt->order, ckpt->sec, ckpt->ckpt_size,
+ ckpt->write_gen));
+ sep = ",";
+ }
+ WT_ERR(__wt_buf_catfmt(session, buf, ")"));
+ if (ckptlsn != NULL)
+ WT_ERR(__wt_buf_catfmt(session, buf,
+ ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")",
+ ckptlsn->file, (uintmax_t)ckptlsn->offset));
+ WT_ERR(__ckpt_set(session, fname, buf->mem));
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_meta_ckptlist_free --
+ * Discard the checkpoint array.
+ */
+void
+__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+{
+ WT_CKPT *ckpt;
+
+ if (ckptbase == NULL)
+ return;
+
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ __wt_meta_checkpoint_free(session, ckpt);
+ __wt_free(session, ckptbase);
+}
+
+/*
+ * __wt_meta_checkpoint_free --
+ * Clean up a single checkpoint structure.
+ */
+void
+__wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
+{
+ if (ckpt == NULL)
+ return;
+
+ __wt_free(session, ckpt->name);
+ __wt_buf_free(session, &ckpt->addr);
+ __wt_buf_free(session, &ckpt->raw);
+ __wt_free(session, ckpt->bpriv);
+
+ WT_CLEAR(*ckpt); /* Clear to prepare for re-use. */
+}
+
+/*
+ * __ckpt_version_chk --
+ * Check the version major/minor numbers.
+ */
+static int
+__ckpt_version_chk(
+ WT_SESSION_IMPL *session, const char *fname, const char *config)
+{
+ WT_CONFIG_ITEM a, v;
+ int majorv, minorv;
+
+ WT_RET(__wt_config_getones(session, config, "version", &v));
+ WT_RET(__wt_config_subgets(session, &v, "major", &a));
+ majorv = (int)a.val;
+ WT_RET(__wt_config_subgets(session, &v, "minor", &a));
+ minorv = (int)a.val;
+
+ if (majorv < WT_BTREE_MAJOR_VERSION_MIN ||
+ majorv > WT_BTREE_MAJOR_VERSION_MAX ||
+ (majorv == WT_BTREE_MAJOR_VERSION_MIN &&
+ minorv < WT_BTREE_MINOR_VERSION_MIN) ||
+ (majorv == WT_BTREE_MAJOR_VERSION_MAX &&
+ minorv > WT_BTREE_MINOR_VERSION_MAX))
+ WT_RET_MSG(session, EACCES,
+ "%s is an unsupported WiredTiger source file version %d.%d"
+ "; this WiredTiger build only supports versions from %d.%d "
+ "to %d.%d",
+ fname,
+ majorv, minorv,
+ WT_BTREE_MAJOR_VERSION_MIN,
+ WT_BTREE_MINOR_VERSION_MIN,
+ WT_BTREE_MAJOR_VERSION_MAX,
+ WT_BTREE_MINOR_VERSION_MAX);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_ext.c b/src/third_party/wiredtiger/src/meta/meta_ext.c
new file mode 100644
index 00000000000..b68058a6e91
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_ext.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_metadata_insert --
+ * Insert a row into the metadata (external API version).
+ */
+int
+__wt_ext_metadata_insert(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, const char *key, const char *value)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_insert(session, key, value));
+}
+
+/*
+ * __wt_ext_metadata_remove --
+ * Remove a row from the metadata (external API version).
+ */
+int
+__wt_ext_metadata_remove(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_remove(session, key));
+}
+
+/*
+ * __wt_ext_metadata_search --
+ * Return a copied row from the metadata (external API version).
+ * The caller is responsible for freeing the allocated memory.
+ */
+int
+__wt_ext_metadata_search(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, const char *key, const char **valuep)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_search(session, key, valuep));
+}
+
+/*
+ * __wt_ext_metadata_update --
+ * Update a row in the metadata (external API version).
+ */
+int
+__wt_ext_metadata_update(WT_EXTENSION_API *wt_api,
+ WT_SESSION *wt_session, const char *key, const char *value)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_api->conn;
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = conn->default_session;
+
+ return (__wt_metadata_update(session, key, value));
+}
+
+/*
+ * __wt_metadata_get_ckptlist --
+ * Public entry point to __wt_meta_ckptlist_get (for wt list).
+ */
+int
+__wt_metadata_get_ckptlist(
+ WT_SESSION *session, const char *name, WT_CKPT **ckptbasep)
+{
+ return (__wt_meta_ckptlist_get(
+ (WT_SESSION_IMPL *)session, name, ckptbasep));
+}
+
+/*
+ * __wt_metadata_free_ckptlist --
+ * Public entry point to __wt_meta_ckptlist_free (for wt list).
+ */
+void
+__wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase)
+{
+ __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, ckptbase);
+}
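+
+/*
+ * Usage sketch (hedged: assumes these functions are wired into the
+ * WT_EXTENSION_API method table by the connection code, which isn't shown
+ * in this diff): an extension would call, e.g.:
+ *
+ *	ret = wt_api->metadata_insert(wt_api, wt_session, key, value);
+ */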
diff --git a/src/third_party/wiredtiger/src/meta/meta_table.c b/src/third_party/wiredtiger/src/meta/meta_table.c
new file mode 100644
index 00000000000..e66ed609952
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_table.c
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __metadata_turtle --
+ * Return if a key's value should be taken from the turtle file.
+ */
+static int
+__metadata_turtle(const char *key)
+{
+ switch (key[0]) {
+ case 'f':
+ if (strcmp(key, WT_METAFILE_URI) == 0)
+ return (1);
+ break;
+ case 'W':
+ if (strcmp(key, "WiredTiger version") == 0)
+ return (1);
+ if (strcmp(key, "WiredTiger version string") == 0)
+ return (1);
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __wt_metadata_open --
+ *	Open the metadata file and set session->metafile.
+ */
+int
+__wt_metadata_open(WT_SESSION_IMPL *session)
+{
+ if (session->metafile != NULL)
+ return (0);
+
+ WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0));
+
+ session->metafile = S2BT(session);
+ WT_ASSERT(session, session->metafile != NULL);
+
+ /* The metafile doesn't need to stay locked -- release it. */
+ return (__wt_session_release_btree(session));
+}
+
+/*
+ * __wt_metadata_cursor --
+ *	Open a cursor on the metadata.
+ */
+int
+__wt_metadata_cursor(
+ WT_SESSION_IMPL *session, const char *config, WT_CURSOR **cursorp)
+{
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_open_cursor), config, NULL };
+
+ saved_dhandle = session->dhandle;
+ WT_ERR(__wt_metadata_open(session));
+
+ WT_SET_BTREE_IN_SESSION(session, session->metafile);
+
+ /*
+ * We use the metadata a lot, so we have a handle cached; lock it and
+ * increment the in-use counter.
+ */
+ WT_ERR(__wt_session_lock_btree(session, 0));
+ __wt_session_dhandle_incr_use(session);
+
+ ret = __wt_curfile_create(session, NULL, cfg, 0, 0, cursorp);
+
+ /* Restore the caller's btree. */
+err: session->dhandle = saved_dhandle;
+ return (ret);
+}
+
+/*
+ * __wt_metadata_insert --
+ * Insert a row into the metadata.
+ */
+int
+__wt_metadata_insert(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Insert: key: %s, value: %s, tracking: %s, %s" "turtle",
+ key, value, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ WT_RET_MSG(session, EINVAL,
+ "%s: insert not supported on the turtle file", key);
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ WT_ERR(cursor->insert(cursor));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_insert(session, key));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_update --
+ * Update a row in the metadata.
+ */
+int
+__wt_metadata_update(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Update: key: %s, value: %s, tracking: %s, %s" "turtle",
+ key, value, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ return (__wt_turtle_update(session, key, value));
+
+ if (WT_META_TRACKING(session))
+ WT_RET(__wt_meta_track_update(session, key));
+
+ WT_RET(__wt_metadata_cursor(session, "overwrite", &cursor));
+ cursor->set_key(cursor, key);
+ cursor->set_value(cursor, value);
+ WT_ERR(cursor->insert(cursor));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_remove --
+ * Remove a row from the metadata.
+ */
+int
+__wt_metadata_remove(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Remove: key: %s, tracking: %s, %s" "turtle",
+ key, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ WT_RET_MSG(session, EINVAL,
+ "%s: remove not supported on the turtle file", key);
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ WT_ERR(cursor->search(cursor));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_update(session, key));
+ WT_ERR(cursor->remove(cursor));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_metadata_search --
+ * Return a copied row from the metadata.
+ * The caller is responsible for freeing the allocated memory.
+ */
+int
+__wt_metadata_search(
+ WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *value;
+
+ *valuep = NULL;
+
+ WT_RET(__wt_verbose(session, WT_VERB_METADATA,
+ "Search: key: %s, tracking: %s, %s" "turtle",
+ key, WT_META_TRACKING(session) ? "true" : "false",
+ __metadata_turtle(key) ? "" : "not "));
+
+ if (__metadata_turtle(key))
+ return (__wt_turtle_read(session, key, valuep));
+
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, key);
+ WT_ERR(cursor->search(cursor));
+ WT_ERR(cursor->get_value(cursor, &value));
+ WT_ERR(__wt_strdup(session, value, valuep));
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
new file mode 100644
index 00000000000..55e61f8d1bc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -0,0 +1,365 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * WT_META_TRACK -- A tracked metadata operation: a non-transactional log,
+ * maintained to make it easy to unroll simple metadata and filesystem
+ * operations.
+ */
+typedef struct __wt_meta_track {
+ enum {
+ WT_ST_EMPTY, /* Unused slot */
+ WT_ST_CHECKPOINT, /* Complete a checkpoint */
+ WT_ST_FILEOP, /* File operation */
+ WT_ST_LOCK, /* Lock a handle */
+ WT_ST_REMOVE, /* Remove a metadata entry */
+ WT_ST_SET /* Reset a metadata entry */
+ } op;
+ const char *a, *b; /* Strings */
+ WT_BTREE *btree; /* Locked handle */
+ int created; /* Handle on newly created file */
+} WT_META_TRACK;
+
+/*
+ * __meta_track_next --
+ * Extend the list of operations we're tracking, as necessary, and
+ * optionally return the next slot.
+ */
+static int
+__meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp)
+{
+ size_t offset, sub_off;
+
+ if (session->meta_track_next == NULL)
+ session->meta_track_next = session->meta_track;
+
+ offset = WT_PTRDIFF(session->meta_track_next, session->meta_track);
+ sub_off = WT_PTRDIFF(session->meta_track_sub, session->meta_track);
+ if (offset == session->meta_track_alloc) {
+ WT_RET(__wt_realloc(session, &session->meta_track_alloc,
+ WT_MAX(2 * session->meta_track_alloc,
+ 20 * sizeof(WT_META_TRACK)), &session->meta_track));
+
+ /* Maintain positions in the new chunk of memory. */
+ session->meta_track_next =
+ (uint8_t *)session->meta_track + offset;
+ if (session->meta_track_sub != NULL)
+ session->meta_track_sub =
+ (uint8_t *)session->meta_track + sub_off;
+ }
+
+ WT_ASSERT(session, session->meta_track_next != NULL);
+
+ if (trkp != NULL) {
+ *trkp = session->meta_track_next;
+ session->meta_track_next = *trkp + 1;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_meta_track_discard --
+ * Cleanup metadata tracking when closing a session.
+ */
+void
+__wt_meta_track_discard(WT_SESSION_IMPL *session)
+{
+ __wt_free(session, session->meta_track);
+ session->meta_track_next = NULL;
+ session->meta_track_alloc = 0;
+}
+
+/*
+ * __wt_meta_track_on --
+ * Turn on metadata operation tracking.
+ */
+int
+__wt_meta_track_on(WT_SESSION_IMPL *session)
+{
+ if (session->meta_track_nest++ == 0)
+ WT_RET(__meta_track_next(session, NULL));
+
+ return (0);
+}
+
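+/*
+ * Usage sketch (assumed caller pattern, inferred from the on/off/unroll
+ * API in this file): schema operations bracket their metadata and file
+ * operations so that a failure unrolls them:
+ *
+ *	WT_RET(__wt_meta_track_on(session));
+ *	ret = ...schema and metadata work...;
+ *	WT_TRET(__wt_meta_track_off(session, ret != 0));
+ */
+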
+/*
+ * __meta_track_apply --
+ * Apply the changes in a metadata tracking record.
+ */
+static int
+__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
+{
+ WT_BM *bm;
+ WT_DECL_RET;
+ int tret;
+
+ /*
+ * Unlock handles and complete checkpoints regardless of whether we are
+ * unrolling.
+ */
+ if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK)
+ goto free;
+
+ switch (trk->op) {
+ case WT_ST_EMPTY: /* Unused slot */
+ break;
+ case WT_ST_CHECKPOINT: /* Checkpoint, see above */
+ if (!unroll) {
+ bm = trk->btree->bm;
+ WT_WITH_BTREE(session, trk->btree,
+ WT_TRET(bm->checkpoint_resolve(bm, session)));
+ }
+ break;
+ case WT_ST_LOCK: /* Handle lock, see above */
+ if (unroll && trk->created)
+ F_SET(trk->btree->dhandle, WT_DHANDLE_DISCARD);
+ WT_WITH_BTREE(session, trk->btree,
+ WT_TRET(__wt_session_release_btree(session)));
+ break;
+ case WT_ST_FILEOP: /* File operation */
+ /*
+ * For renames, both a and b are set.
+ * For creates, a is NULL.
+ * For removes, b is NULL.
+ */
+ if (trk->a != NULL && trk->b != NULL &&
+ (tret = __wt_rename(session,
+ trk->b + strlen("file:"),
+ trk->a + strlen("file:"))) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll rename %s to %s",
+ trk->b, trk->a);
+ WT_TRET(tret);
+ } else if (trk->a == NULL) {
+ if ((tret = __wt_remove(session,
+ trk->b + strlen("file:"))) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll create %s",
+ trk->b);
+ WT_TRET(tret);
+ }
+ }
+ /*
+ * We can't undo removes yet: that would imply
+ * some kind of temporary rename and remove in
+ * roll forward.
+ */
+ break;
+ case WT_ST_REMOVE: /* Remove trk.a */
+ if ((tret = __wt_metadata_remove(session, trk->a)) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll remove: %s",
+ trk->a);
+ WT_TRET(tret);
+ }
+ break;
+ case WT_ST_SET: /* Set trk.a to trk.b */
+ if ((tret = __wt_metadata_update(
+ session, trk->a, trk->b)) != 0) {
+ __wt_err(session, tret,
+ "metadata unroll update %s to %s",
+ trk->a, trk->b);
+ WT_TRET(tret);
+ }
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+free:	trk->op = WT_ST_EMPTY;
+ __wt_free(session, trk->a);
+ __wt_free(session, trk->b);
+ trk->btree = NULL;
+
+ return (ret);
+}
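+
+/*
+ * A sketch of how the file-operation cases above are recorded (the file
+ * names are illustrative): renames track both names, creates track only
+ * the new name, removes track only the old name.
+ *
+ *     __wt_meta_track_fileop(session, "file:a.wt", "file:b.wt");  rename
+ *     __wt_meta_track_fileop(session, NULL, "file:b.wt");         create
+ *     __wt_meta_track_fileop(session, "file:a.wt", NULL);         remove
+ */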
+
+/*
+ * __wt_meta_track_off --
+ * Turn off metadata operation tracking, unrolling on error.
+ */
+int
+__wt_meta_track_off(WT_SESSION_IMPL *session, int unroll)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk, *trk_orig;
+
+ WT_ASSERT(session,
+ WT_META_TRACKING(session) && session->meta_track_nest > 0);
+
+ trk_orig = session->meta_track;
+ trk = session->meta_track_next;
+
+ /* If it was a nested transaction, there is nothing to do. */
+ if (--session->meta_track_nest != 0)
+ return (0);
+
+ /* Turn off tracking for unroll. */
+ session->meta_track_next = session->meta_track_sub = NULL;
+
+ /*
+ * If there were no operations logged, return now and avoid unnecessary
+ * metadata checkpoints. For example, this happens if attempting to
+ * create a data source that already exists (or drop one that doesn't).
+ */
+ if (trk == trk_orig)
+ return (0);
+
+ while (--trk >= trk_orig)
+ WT_TRET(__meta_track_apply(session, trk, unroll));
+
+ /*
+ * If the operation succeeded and we aren't relying on the log for
+ * durability, checkpoint the metadata.
+ */
+ if (!unroll && ret == 0 && session->metafile != NULL &&
+ !S2C(session)->logging)
+ WT_WITH_BTREE(session, session->metafile,
+ ret = __wt_checkpoint(session, NULL));
+
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_sub_on --
+ * Start a group of operations that can be committed independent of the
+ * main transaction.
+ */
+int
+__wt_meta_track_sub_on(WT_SESSION_IMPL *session)
+{
+ WT_ASSERT(session, session->meta_track_sub == NULL);
+ session->meta_track_sub = session->meta_track_next;
+ return (0);
+}
+
+/*
+ * __wt_meta_track_sub_off --
+ * Commit a group of operations independent of the main transaction.
+ */
+int
+__wt_meta_track_sub_off(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk, *trk_orig;
+
+ if (!WT_META_TRACKING(session) || session->meta_track_sub == NULL)
+ return (0);
+
+ trk_orig = session->meta_track_sub;
+ trk = session->meta_track_next;
+
+ /* Turn off tracking for unroll. */
+ session->meta_track_next = session->meta_track_sub = NULL;
+
+ while (--trk >= trk_orig)
+ WT_TRET(__meta_track_apply(session, trk, 0));
+
+ session->meta_track_next = trk_orig;
+ return (ret);
+}
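+
+/*
+ * A minimal sketch of a sub-group (the update call is illustrative):
+ * operations tracked between sub_on and sub_off are resolved when
+ * sub_off returns, so the enclosing __wt_meta_track_off can no longer
+ * unroll them.
+ *
+ *     WT_RET(__wt_meta_track_sub_on(session));
+ *     WT_RET(__wt_metadata_update(session, key, value));
+ *     WT_RET(__wt_meta_track_sub_off(session));
+ */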
+
+/*
+ * __wt_meta_track_checkpoint --
+ * Track a handle involved in a checkpoint.
+ */
+int
+__wt_meta_track_checkpoint(WT_SESSION_IMPL *session)
+{
+ WT_META_TRACK *trk;
+
+ WT_ASSERT(session, session->dhandle != NULL);
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_CHECKPOINT;
+ trk->btree = S2BT(session);
+ return (0);
+}
+
+/*
+ * __wt_meta_track_insert --
+ * Track an insert operation.
+ */
+int
+__wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_REMOVE;
+ WT_RET(__wt_strdup(session, key, &trk->a));
+
+ return (0);
+}
+
+/*
+ * __wt_meta_track_update --
+ * Track a metadata update operation.
+ */
+int
+__wt_meta_track_update(WT_SESSION_IMPL *session, const char *key)
+{
+ WT_DECL_RET;
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_SET;
+ WT_RET(__wt_strdup(session, key, &trk->a));
+
+ /*
+ * If there was a previous value, keep it around -- if not, then this
+ * "update" is really an insert.
+ */
+ if ((ret =
+ __wt_metadata_search(session, key, &trk->b)) == WT_NOTFOUND) {
+ trk->op = WT_ST_REMOVE;
+ ret = 0;
+ }
+ return (ret);
+}
+
+/*
+ * __wt_meta_track_fileop --
+ * Track a filesystem operation.
+ */
+int
+__wt_meta_track_fileop(
+ WT_SESSION_IMPL *session, const char *olduri, const char *newuri)
+{
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_FILEOP;
+ if (olduri != NULL)
+ WT_RET(__wt_strdup(session, olduri, &trk->a));
+ if (newuri != NULL)
+ WT_RET(__wt_strdup(session, newuri, &trk->b));
+ return (0);
+}
+
+/*
+ * __wt_meta_track_handle_lock --
+ * Track a locked handle.
+ */
+int
+__wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created)
+{
+ WT_META_TRACK *trk;
+
+ WT_ASSERT(session, session->dhandle != NULL);
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_LOCK;
+ trk->btree = S2BT(session);
+ trk->created = created;
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c
new file mode 100644
index 00000000000..d6060ebf47b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c
@@ -0,0 +1,318 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __metadata_config --
+ * Return the default configuration information for the metadata file.
+ */
+static int
+__metadata_config(WT_SESSION_IMPL *session, const char **metaconfp)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ const char *cfg[] = { WT_CONFIG_BASE(session, file_meta), NULL, NULL };
+ const char *metaconf;
+
+ *metaconfp = NULL;
+
+ metaconf = NULL;
+
+ /* Create a turtle file with default values. */
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf,
+ "key_format=S,value_format=S,id=%d,version=(major=%d,minor=%d)",
+ WT_METAFILE_ID,
+ WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
+ cfg[1] = buf->data;
+ WT_ERR(__wt_config_collapse(session, cfg, &metaconf));
+
+ *metaconfp = metaconf;
+
+ if (0) {
+err: __wt_free(session, metaconf);
+ }
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __metadata_init --
+ * Create the metadata file.
+ */
+static int
+__metadata_init(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+
+ /*
+ * We're single-threaded, but acquire the schema lock regardless: the
+ * lower level code checks that it is appropriately synchronized.
+ */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_create(session, WT_METAFILE_URI, NULL));
+
+ return (ret);
+}
+
+/*
+ * __metadata_load_hot_backup --
+ * Load the contents of any hot backup file.
+ */
+static int
+__metadata_load_hot_backup(WT_SESSION_IMPL *session)
+{
+ FILE *fp;
+ WT_DECL_ITEM(key);
+ WT_DECL_ITEM(value);
+ WT_DECL_RET;
+ char *path;
+
+ fp = NULL;
+ path = NULL;
+
+ /* Look for a hot backup file: if we find it, load it. */
+ WT_RET(__wt_filename(session, WT_METADATA_BACKUP, &path));
+ fp = fopen(path, "r");
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (0);
+
+ /* Read line pairs and load them into the metadata file. */
+ WT_ERR(__wt_scr_alloc(session, 512, &key));
+ WT_ERR(__wt_scr_alloc(session, 512, &value));
+ for (;;) {
+ WT_ERR(__wt_getline(session, key, fp));
+ if (key->size == 0)
+ break;
+ WT_ERR(__wt_getline(session, value, fp));
+ if (value->size == 0)
+ WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP));
+ WT_ERR(__wt_metadata_update(session, key->data, value->data));
+ }
+
+ F_SET(S2C(session), WT_CONN_WAS_BACKUP);
+
+err: if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ __wt_scr_free(&key);
+ __wt_scr_free(&value);
+ return (ret);
+}
+
+/*
+ * __metadata_load_bulk --
+ * Create any bulk-loaded file stubs.
+ */
+static int
+__metadata_load_bulk(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint32_t allocsize;
+ int exist;
+ const char *filecfg[] = { WT_CONFIG_BASE(session, file_meta), NULL };
+ const char *key;
+
+ /*
+ * If a file was being bulk-loaded during the hot backup, it will appear
+ * in the metadata file, but the file won't exist. Create on demand.
+ */
+ WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &key));
+ if (!WT_PREFIX_SKIP(key, "file:"))
+ continue;
+
+ /* If the file exists, it's all good. */
+ WT_ERR(__wt_exist(session, key, &exist));
+ if (exist)
+ continue;
+
+ /*
+ * If the file doesn't exist, assume it's a bulk-loaded file;
+ * retrieve the allocation size and re-create the file.
+ */
+ WT_ERR(__wt_direct_io_size_check(
+ session, filecfg, "allocation_size", &allocsize));
+ WT_ERR(__wt_block_manager_create(session, key, allocsize));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+
+ return (ret);
+}
+
+/*
+ * __wt_turtle_init --
+ * Check the turtle file and create if necessary.
+ */
+int
+__wt_turtle_init(WT_SESSION_IMPL *session)
+{
+ WT_DECL_RET;
+ int exist;
+ const char *metaconf;
+
+ metaconf = NULL;
+
+ /*
+ * Discard any turtle setup file left-over from previous runs. This
+ * doesn't matter for correctness, it's just cleaning up random files.
+ */
+ WT_RET(__wt_exist(session, WT_METADATA_TURTLE_SET, &exist));
+ if (exist)
+ WT_RET(__wt_remove(session, WT_METADATA_TURTLE_SET));
+
+ /*
+ * We could die after creating the turtle file and before creating the
+ * metadata file, or worse, the metadata file might be in some random
+ * state. Make sure that doesn't happen: if we don't find the turtle
+ * file, first create the metadata file, load any hot backup, and then
+ * create the turtle file. No matter what happens, if metadata file
+ * creation doesn't fully complete, we won't have a turtle file and we
+ * will repeat the process until we succeed.
+ *
+ * If there's already a turtle file, we're done.
+ */
+ WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist));
+ if (exist)
+ return (0);
+
+ /* Create the metadata file. */
+ WT_RET(__metadata_init(session));
+
+ /* Load any hot-backup information. */
+ WT_RET(__metadata_load_hot_backup(session));
+
+ /* Create any bulk-loaded file stubs. */
+ WT_RET(__metadata_load_bulk(session));
+
+ /* Create the turtle file. */
+ WT_RET(__metadata_config(session, &metaconf));
+ WT_ERR(__wt_turtle_update(session, WT_METAFILE_URI, metaconf));
+
+ /* Remove the backup file if it exists, we'll never read it again. */
+ WT_ERR(__wt_exist(session, WT_METADATA_BACKUP, &exist));
+ if (exist)
+ WT_ERR(__wt_remove(session, WT_METADATA_BACKUP));
+
+err: __wt_free(session, metaconf);
+ return (ret);
+}
+
+/*
+ * __wt_turtle_read --
+ * Read the turtle file.
+ */
+int
+__wt_turtle_read(WT_SESSION_IMPL *session, const char *key, const char **valuep)
+{
+ FILE *fp;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ int match;
+ char *path;
+
+ *valuep = NULL;
+
+ fp = NULL;
+ path = NULL;
+
+ /*
+ * Open the turtle file; there's one case where we won't find the turtle
+ * file, yet still succeed. We create the metadata file before creating
+ * the turtle file, and that means returning the default configuration
+ * string for the metadata file.
+ */
+ WT_RET(__wt_filename(session, WT_METADATA_TURTLE, &path));
+ if ((fp = fopen(path, "r")) == NULL)
+ ret = __wt_errno();
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (strcmp(key, WT_METAFILE_URI) == 0 ?
+ __metadata_config(session, valuep) : ret);
+
+ /* Search for the key. */
+ WT_ERR(__wt_scr_alloc(session, 512, &buf));
+ for (match = 0;;) {
+ WT_ERR(__wt_getline(session, buf, fp));
+ if (buf->size == 0)
+ WT_ERR(WT_NOTFOUND);
+ if (strcmp(key, buf->data) == 0)
+ match = 1;
+
+ /* Key matched: read the subsequent line for the value. */
+ WT_ERR(__wt_getline(session, buf, fp));
+ if (buf->size == 0)
+ WT_ERR(__wt_illegal_value(session, WT_METADATA_TURTLE));
+ if (match)
+ break;
+ }
+
+ /* Copy the value for the caller. */
+ WT_ERR(__wt_strdup(session, buf->data, valuep));
+
+err: if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_turtle_update --
+ * Update the turtle file.
+ */
+int
+__wt_turtle_update(
+ WT_SESSION_IMPL *session, const char *key, const char *value)
+{
+ FILE *fp;
+ WT_DECL_RET;
+ int vmajor, vminor, vpatch;
+ const char *version;
+ char *path;
+
+ fp = NULL;
+ path = NULL;
+
+ /*
+ * Create the turtle setup file: we currently re-write it from scratch
+ * every time.
+ */
+ WT_RET(__wt_filename(session, WT_METADATA_TURTLE_SET, &path));
+ if ((fp = fopen(path, "w")) == NULL)
+ ret = __wt_errno();
+ __wt_free(session, path);
+ if (fp == NULL)
+ return (ret);
+
+ version = wiredtiger_version(&vmajor, &vminor, &vpatch);
+ WT_ERR_TEST((fprintf(fp,
+ "%s\n%s\n%s\n" "major=%d,minor=%d,patch=%d\n%s\n%s\n",
+ WT_METADATA_VERSION_STR, version,
+ WT_METADATA_VERSION, vmajor, vminor, vpatch,
+ key, value) < 0), __wt_errno());
+
+ ret = fclose(fp);
+ fp = NULL;
+ WT_ERR_TEST(ret == EOF, __wt_errno());
+
+ WT_ERR(
+ __wt_rename(session, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE));
+
+ if (0) {
+err: WT_TRET(__wt_remove(session, WT_METADATA_TURTLE_SET));
+ }
+
+ if (fp != NULL)
+ WT_TRET(fclose(fp) == 0 ? 0 : __wt_errno());
+ return (ret);
+}
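+
+/*
+ * The turtle file is plain line-oriented text: a version-string pair, a
+ * version-number pair, then the metadata key and value, each on its own
+ * line. A sketch of the layout (the header strings and numbers shown are
+ * illustrative):
+ *
+ *     WiredTiger version string
+ *     WiredTiger 2.3.0
+ *     WiredTiger version
+ *     major=2,minor=3,patch=0
+ *     file:WiredTiger.wt
+ *     key_format=S,value_format=S,...
+ */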
diff --git a/src/third_party/wiredtiger/src/os_posix/os_abort.c b/src/third_party/wiredtiger/src/os_posix/os_abort.c
new file mode 100644
index 00000000000..3d99ffe20b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_abort.c
@@ -0,0 +1,26 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_abort --
+ * Abort the process, dropping core.
+ */
+void
+__wt_abort(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_ATTRIBUTE((noreturn))
+{
+ __wt_errx(session, "aborting WiredTiger library");
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_attach(session);
+#endif
+
+ abort();
+ /* NOTREACHED */
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_alloc.c b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
new file mode 100644
index 00000000000..f7344032a15
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_alloc.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * There's no malloc interface; WiredTiger never calls malloc.
+ *
+ * The problem is an application might allocate memory, write secret stuff in
+ * it, free the memory, then WiredTiger allocates the memory and uses it for a
+ * file page or log record, then writes it to disk, without having overwritten
+ * it fully. That results in the secret stuff being protected by WiredTiger's
+ * permission mechanisms, potentially inappropriate for the secret stuff.
+ */
+
+/*
+ * __wt_calloc --
+ * ANSI calloc function.
+ */
+int
+__wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
+{
+ void *p;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ WT_ASSERT(session, number != 0 && size != 0);
+
+ if (session != NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+
+ if ((p = calloc(number, size)) == NULL)
+ WT_RET_MSG(session, __wt_errno(), "memory allocation");
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_realloc --
+ * ANSI realloc function.
+ */
+int
+__wt_realloc(WT_SESSION_IMPL *session,
+ size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+ void *p;
+ size_t bytes_allocated;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ *
+ * Sometimes we're allocating memory and we don't care about the
+ * final length -- bytes_allocated_ret may be NULL.
+ */
+ p = *(void **)retp;
+ bytes_allocated =
+ (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(session,
+ (p == NULL && bytes_allocated == 0) ||
+ (p != NULL &&
+ (bytes_allocated_ret == NULL || bytes_allocated != 0)));
+ WT_ASSERT(session, bytes_to_allocate != 0);
+ WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
+
+ if (session != NULL) {
+ if (p == NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+ else
+ WT_STAT_FAST_CONN_INCR(session, memory_grow);
+ }
+
+ if ((p = realloc(p, bytes_to_allocate)) == NULL)
+ WT_RET_MSG(session, __wt_errno(), "memory allocation");
+
+ /*
+ * Clear the allocated memory -- an application might: allocate memory,
+ * write secret stuff into it, free the memory, then we re-allocate the
+ * memory and use it for a file page or log record, and then write it to
+ * disk. That would result in the secret stuff being protected by the
+ * WiredTiger permission mechanisms, potentially inappropriate for the
+ * secret stuff.
+ */
+ memset((uint8_t *)
+ p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated);
+
+ /* Update caller's bytes allocated value. */
+ if (bytes_allocated_ret != NULL)
+ *bytes_allocated_ret = bytes_to_allocate;
+
+ *(void **)retp = p;
+ return (0);
+}
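+
+/*
+ * A minimal sketch of the growth pattern this interface supports (the
+ * names are illustrative): the caller owns both the pointer and the
+ * allocated size, and bytes added by each call arrive zeroed.
+ *
+ *     size_t alloc_bytes = 0;
+ *     char *buf = NULL;
+ *     WT_RET(__wt_realloc(session, &alloc_bytes, 512, &buf));
+ *     WT_RET(__wt_realloc(session, &alloc_bytes, 4096, &buf));
+ */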
+
+/*
+ * __wt_realloc_aligned --
+ * ANSI realloc function that aligns to buffer boundaries, configured with
+ * the "buffer_alignment" key to wiredtiger_open.
+ */
+int
+__wt_realloc_aligned(WT_SESSION_IMPL *session,
+ size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+ WT_DECL_RET;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ if (session != NULL && S2C(session)->buffer_alignment > 0) {
+ void *p, *newp;
+ size_t bytes_allocated;
+
+ /*
+ * Sometimes we're allocating memory and we don't care about the
+ * final length -- bytes_allocated_ret may be NULL.
+ */
+ p = *(void **)retp;
+ bytes_allocated =
+ (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
+ WT_ASSERT(session,
+ (p == NULL && bytes_allocated == 0) ||
+ (p != NULL &&
+ (bytes_allocated_ret == NULL || bytes_allocated != 0)));
+ WT_ASSERT(session, bytes_to_allocate != 0);
+ WT_ASSERT(session, bytes_allocated < bytes_to_allocate);
+
+ if (session != NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_allocation);
+
+ if ((ret = posix_memalign(&newp,
+ S2C(session)->buffer_alignment,
+ bytes_to_allocate)) != 0)
+ WT_RET_MSG(session, ret, "memory allocation");
+
+ if (p != NULL)
+ memcpy(newp, p, bytes_allocated);
+ __wt_free(session, p);
+ p = newp;
+
+ /* Clear the allocated memory (see above). */
+ memset((uint8_t *)p + bytes_allocated, 0,
+ bytes_to_allocate - bytes_allocated);
+
+ /* Update caller's bytes allocated value. */
+ if (bytes_allocated_ret != NULL)
+ *bytes_allocated_ret = bytes_to_allocate;
+
+ *(void **)retp = p;
+ return (0);
+ }
+#endif
+ /*
+ * If there is no posix_memalign function, or no alignment configured,
+ * fall back to realloc.
+ *
+ * Windows note: Visual C CRT memalign does not match Posix behavior
+ * and would also double each allocation, so it is bad for memory use.
+ */
+ return (__wt_realloc(
+ session, bytes_allocated_ret, bytes_to_allocate, retp));
+}
+
+/*
+ * __wt_strndup --
+ * Duplicate a byte string of a given length (and NUL-terminate).
+ */
+int
+__wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
+{
+ void *p;
+
+ if (str == NULL) {
+ *(void **)retp = NULL;
+ return (0);
+ }
+
+ WT_RET(__wt_calloc(session, len + 1, 1, &p));
+
+ /*
+ * Don't change this to strncpy, we rely on this function to duplicate
+ * "strings" that contain nul bytes.
+ */
+ memcpy(p, str, len);
+
+ *(void **)retp = p;
+ return (0);
+}
+
+/*
+ * __wt_strdup --
+ * ANSI strdup function.
+ */
+int
+__wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp)
+{
+ return (__wt_strndup(
+ session, str, (str == NULL) ? 0 : strlen(str), retp));
+}
+
+/*
+ * __wt_free_int --
+ * ANSI free function.
+ */
+void
+__wt_free_int(WT_SESSION_IMPL *session, const void *p_arg)
+{
+ void *p;
+
+ p = *(void **)p_arg;
+ if (p == NULL) /* ANSI C free semantics */
+ return;
+
+ /*
+ * If there's a serialization bug we might race with another thread.
+ * We can't avoid the race (and we aren't willing to flush memory),
+ * but we minimize the window by clearing the free address, hoping a
+ * racing thread will see, and won't free, a NULL pointer.
+ */
+ *(void **)p_arg = NULL;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ */
+ if (session != NULL)
+ WT_STAT_FAST_CONN_INCR(session, memory_free);
+
+ free(p);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dir.c b/src/third_party/wiredtiger/src/os_posix/os_dir.c
new file mode 100644
index 00000000000..98b2d4926cd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_dir.c
@@ -0,0 +1,94 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+/* This include likely needs a configure-time check. */
+#include <dirent.h>
+
+/*
+ * __wt_dirlist --
+ * Get a list of files from a directory, optionally filtered by
+ * a given prefix.
+ */
+int
+__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
+ uint32_t flags, char ***dirlist, u_int *countp)
+{
+ struct dirent *dp;
+ DIR *dirp;
+ WT_DECL_RET;
+ size_t dirallocsz;
+ u_int count, dirsz;
+ int match;
+ char **entries, *path;
+
+ *dirlist = NULL;
+ *countp = 0;
+
+ WT_RET(__wt_filename(session, dir, &path));
+
+ dirp = NULL;
+ dirallocsz = 0;
+ dirsz = 0;
+ entries = NULL;
+ if (flags == 0)
+ LF_SET(WT_DIRLIST_INCLUDE);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
+ "wt_dirlist of %s %s prefix %s",
+ path, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
+ prefix == NULL ? "all" : prefix));
+
+ WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "%s: opendir", path);
+ for (dirsz = 0, count = 0; (dp = readdir(dirp)) != NULL;) {
+ /*
+ * Skip . and ..
+ */
+ if (strcmp(dp->d_name, ".") == 0 ||
+ strcmp(dp->d_name, "..") == 0)
+ continue;
+ match = 0;
+ if (prefix != NULL &&
+ ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
+ WT_PREFIX_MATCH(dp->d_name, prefix)) ||
+ (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
+ !WT_PREFIX_MATCH(dp->d_name, prefix))))
+ match = 1;
+ if (prefix == NULL || match) {
+ /*
+ * We have a file name we want to return.
+ */
+ count++;
+ if (count > dirsz) {
+ dirsz += WT_DIR_ENTRY;
+ WT_ERR(__wt_realloc_def(
+ session, &dirallocsz, dirsz, &entries));
+ }
+ WT_ERR(__wt_strdup(
+ session, dp->d_name, &entries[count-1]));
+ }
+ }
+ if (count > 0)
+ *dirlist = entries;
+ *countp = count;
+err:
+ if (dirp != NULL)
+ (void)closedir(dirp);
+ __wt_free(session, path);
+
+ if (ret == 0)
+ return (0);
+
+ if (entries != NULL) {
+ for (count = dirsz; count > 0; count--)
+ __wt_free(session, entries[count - 1]);
+ __wt_free(session, entries);
+ }
+ WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix);
+}
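+
+/*
+ * A minimal sketch of listing files by prefix (the directory and prefix
+ * are illustrative); the caller frees the entries and then the array.
+ *
+ *     char **list;
+ *     u_int count, i;
+ *     WT_RET(__wt_dirlist(session,
+ *         ".", "WiredTiger", WT_DIRLIST_INCLUDE, &list, &count));
+ *     for (i = 0; i < count; ++i)
+ *         __wt_free(session, list[i]);
+ *     __wt_free(session, list);
+ */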
diff --git a/src/third_party/wiredtiger/src/os_posix/os_dlopen.c b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
new file mode 100644
index 00000000000..91410c54c04
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_dlopen.c
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dlopen --
+ * Open a dynamic library.
+ */
+int
+__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
+{
+ WT_DECL_RET;
+ WT_DLH *dlh;
+
+ WT_RET(__wt_calloc_def(session, 1, &dlh));
+ WT_ERR(__wt_strdup(session, path, &dlh->name));
+
+ if ((dlh->handle = dlopen(path, RTLD_LAZY)) == NULL)
+ WT_ERR_MSG(
+ session, __wt_errno(), "dlopen(%s): %s", path, dlerror());
+
+ *dlhp = dlh;
+ if (0) {
+err: __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_dlsym --
+ * Lookup a symbol in a dynamic library.
+ */
+int
+__wt_dlsym(WT_SESSION_IMPL *session,
+ WT_DLH *dlh, const char *name, int fail, void *sym_ret)
+{
+ void *sym;
+
+ *(void **)sym_ret = NULL;
+ if ((sym = dlsym(dlh->handle, name)) == NULL) {
+ if (fail)
+ WT_RET_MSG(session, __wt_errno(),
+ "dlsym(%s in %s): %s", name, dlh->name, dlerror());
+ return (0);
+ }
+
+ *(void **)sym_ret = sym;
+ return (0);
+}
+
+/*
+ * __wt_dlclose --
+ * Close a dynamic library
+ */
+int
+__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
+{
+ WT_DECL_RET;
+
+ /*
+ * FreeBSD dies inside __cxa_finalize when closing handles.
+ *
+ * For now, just skip the dlclose: this may leak some resources until
+ * the process exits, but that is preferable to hard-to-debug crashes
+ * during exit.
+ */
+#ifndef __FreeBSD__
+ if (dlclose(dlh->handle) != 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "dlclose: %s", dlerror());
+ }
+#endif
+
+ __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_errno.c b/src/third_party/wiredtiger/src/os_posix/os_errno.c
new file mode 100644
index 00000000000..9290f7d651f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_errno.c
@@ -0,0 +1,22 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_errno --
+ * Return errno, or WT_ERROR if errno not set.
+ */
+int
+__wt_errno(void)
+{
+ /*
+ * Called when we know an error occurred, and we want the system
+ * error code, but there's some chance it's not set.
+ */
+ return (errno == 0 ? WT_ERROR : errno);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_exist.c b/src/third_party/wiredtiger/src/os_posix/os_exist.c
new file mode 100644
index 00000000000..723f07026e1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_exist.c
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_exist --
+ * Return if the file exists.
+ */
+int
+__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp)
+{
+ struct stat sb;
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ WT_SYSCALL_RETRY(stat(path, &sb), ret);
+
+ __wt_free(session, path);
+
+ if (ret == 0) {
+ *existp = 1;
+ return (0);
+ }
+ if (ret == ENOENT) {
+ *existp = 0;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s: stat", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fallocate.c b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
new file mode 100644
index 00000000000..28cd1979c77
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_fallocate.c
@@ -0,0 +1,97 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#if defined(HAVE_FALLOCATE)
+#include <linux/falloc.h>
+#endif
+
+/*
+ * __wt_fallocate_config --
+ * Configure fallocate behavior for a file handle.
+ */
+void
+__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_UNUSED(session);
+
+ fh->fallocate_available = 0;
+ fh->fallocate_requires_locking = 0;
+
+#ifdef __linux__
+ /*
+ * We've seen Linux systems where posix_fallocate corrupts existing data
+ * (even though that is explicitly disallowed by POSIX). We've not seen
+ * problems with fallocate, so it's left unlocked for now.
+ */
+#if defined(HAVE_FALLOCATE)
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 0;
+#elif defined(HAVE_POSIX_FALLOCATE)
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 1;
+#endif
+#elif defined(HAVE_POSIX_FALLOCATE)
+ /*
+ * FreeBSD and Solaris support posix_fallocate, and so far we've seen
+ * no problems leaving it unlocked.
+ */
+ fh->fallocate_available = 1;
+ fh->fallocate_requires_locking = 0;
+#endif
+}
+
+/*
+ * __wt_fallocate --
+ * Allocate space for a file handle.
+ */
+int
+__wt_fallocate(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+{
+ WT_DECL_RET;
+
+#if defined(HAVE_FALLOCATE)
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: fallocate", fh->name));
+ WT_SYSCALL_RETRY(
+ fallocate(fh->fd, FALLOC_FL_KEEP_SIZE, offset, len), ret);
+ if (ret == 0)
+ return (0);
+
+ /*
+ * Linux returns ENOTSUP for fallocate on some file systems; we return
+ * ENOTSUP, and our caller should avoid calling us again.
+ */
+ if (ret != ENOTSUP)
+ WT_RET_MSG(session, ret, "%s: fallocate", fh->name);
+#elif defined(HAVE_POSIX_FALLOCATE)
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: posix_fallocate", fh->name));
+ WT_SYSCALL_RETRY(posix_fallocate(fh->fd, offset, len), ret);
+ if (ret == 0)
+ return (0);
+
+ /*
+ * Solaris returns EINVAL for posix_fallocate on some file systems; we
+ * return ENOTSUP, and our caller should avoid calling us again.
+ */
+ if (ret != EINVAL)
+ WT_RET_MSG(session, ret, "%s: posix_fallocate", fh->name);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(fh);
+ WT_UNUSED(offset);
+ WT_UNUSED(len);
+ WT_UNUSED(ret);
+#endif
+
+ fh->fallocate_available = 0;
+ fh->fallocate_requires_locking = 0;
+ return (ENOTSUP);
+}
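+
+/*
+ * A minimal caller-side sketch (the extension decision is illustrative):
+ * ENOTSUP also clears fallocate_available, so later calls are skipped
+ * rather than retried.
+ *
+ *     if (fh->fallocate_available &&
+ *         (ret = __wt_fallocate(session, fh, offset, len)) != 0 &&
+ *         ret != ENOTSUP)
+ *         return (ret);
+ */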
diff --git a/src/third_party/wiredtiger/src/os_posix/os_filesize.c b/src/third_party/wiredtiger/src/os_posix/os_filesize.c
new file mode 100644
index 00000000000..3692b135d73
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_filesize.c
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filesize --
+ * Get the size of a file in bytes.
+ */
+int
+__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+{
+ struct stat sb;
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fstat", fh->name));
+
+ WT_SYSCALL_RETRY(fstat(fh->fd, &sb), ret);
+ if (ret == 0) {
+ *sizep = sb.st_size;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s: fstat", fh->name);
+}
+
+/*
+ * __wt_filesize_name --
+ * Return the size of a file in bytes, given a file name.
+ */
+int
+__wt_filesize_name(
+ WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
+{
+ struct stat sb;
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ WT_SYSCALL_RETRY(stat(path, &sb), ret);
+
+ __wt_free(session, path);
+
+ if (ret == 0) {
+ *sizep = sb.st_size;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s: stat", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_flock.c b/src/third_party/wiredtiger/src/os_posix/os_flock.c
new file mode 100644
index 00000000000..e9e653d73e6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_flock.c
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bytelock --
+ * Lock/unlock a byte in a file.
+ */
+int
+__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock)
+{
+ struct flock fl;
+ WT_DECL_RET;
+
+ /*
+ * WiredTiger requires this function be able to acquire locks past
+ * the end of file.
+ *
+ * Note we're using fcntl(2) locking: all fcntl locks associated with a
+ * file for a given process are removed when any file descriptor for the
+ * file is closed by the process, even if a lock was never requested for
+ * that file descriptor.
+ */
+ fl.l_start = byte;
+ fl.l_len = 1;
+ fl.l_type = lock ? F_WRLCK : F_UNLCK;
+ fl.l_whence = SEEK_SET;
+
+ WT_SYSCALL_RETRY(fcntl(fhp->fd, F_SETLK, &fl), ret);
+
+ return (ret);
+}
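+
+/*
+ * A minimal sketch of pairing the lock and unlock calls, for example
+ * around a single-writer lock file (the byte offset is illustrative):
+ *
+ *     WT_RET(__wt_bytelock(fh, (wt_off_t)0, 1));     acquire
+ *     WT_RET(__wt_bytelock(fh, (wt_off_t)0, 0));     release
+ */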
diff --git a/src/third_party/wiredtiger/src/os_posix/os_fsync.c b/src/third_party/wiredtiger/src/os_posix/os_fsync.c
new file mode 100644
index 00000000000..c181809df95
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_fsync.c
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ * Flush a file handle.
+ */
+int
+__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fsync", fh->name));
+
+#ifdef HAVE_FDATASYNC
+ WT_SYSCALL_RETRY(fdatasync(fh->fd), ret);
+#else
+ WT_SYSCALL_RETRY(fsync(fh->fd), ret);
+#endif
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "%s fsync error", fh->name);
+
+ return (0);
+}
+
+/*
+ * __wt_fsync_async --
+ * Flush a file handle and don't wait for the result.
+ */
+int
+__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+#ifdef HAVE_SYNC_FILE_RANGE
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name));
+
+ if ((ret = sync_file_range(fh->fd,
+ (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) == 0)
+ return (0);
+ WT_RET_MSG(session, __wt_errno(), "%s: sync_file_range", fh->name);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(fh);
+ return (0);
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c
new file mode 100644
index 00000000000..3f3034de551
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_ftruncate.c
@@ -0,0 +1,26 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ftruncate --
+ * Truncate a file.
+ */
+int
+__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+{
+ WT_DECL_RET;
+
+ WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret);
+ if (ret == 0) {
+ fh->size = fh->extend_size = len;
+ return (0);
+ }
+
+ WT_RET_MSG(session, ret, "%s ftruncate error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_getline.c b/src/third_party/wiredtiger/src/os_posix/os_getline.c
new file mode 100644
index 00000000000..7ef4065ac3b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_getline.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_getline --
+ * Get a line from a stream.
+ *
+ * Implementation of the POSIX getline or BSD fgetln functions (finding the
+ * function in a portable way is hard; it's simple enough to write it instead).
+ *
+ * Note: Unlike the standard getline calls, this function doesn't include the
+ * trailing newline character in the returned buffer and discards empty lines
+ * (so the caller's EOF marker is a returned line length of 0).
+ */
+int
+__wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, FILE *fp)
+{
+ int c;
+
+ /*
+ * We always NUL-terminate the returned string (even if it's empty),
+ * make sure there's buffer space for a trailing NUL in all cases.
+ */
+ WT_RET(__wt_buf_init(session, buf, 100));
+
+ while ((c = fgetc(fp)) != EOF) {
+ /* Leave space for a trailing NUL. */
+ WT_RET(__wt_buf_extend(session, buf, buf->size + 2));
+ if (c == '\n') {
+ if (buf->size == 0)
+ continue;
+ break;
+ }
+ ((char *)buf->mem)[buf->size++] = (char)c;
+ }
+ if (c == EOF && ferror(fp))
+ WT_RET_MSG(session, __wt_errno(), "file read");
+
+ ((char *)buf->mem)[buf->size] = '\0';
+
+ return (0);
+}
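+
+/*
+ * A minimal sketch of the reader loop implied by the conventions above
+ * (the consume call is illustrative); a returned length of 0 marks EOF.
+ *
+ *     for (;;) {
+ *         WT_RET(__wt_getline(session, buf, fp));
+ *         if (buf->size == 0)
+ *             break;
+ *         consume(buf->data, buf->size);
+ *     }
+ */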
diff --git a/src/third_party/wiredtiger/src/os_posix/os_getopt.c b/src/third_party/wiredtiger/src/os_posix/os_getopt.c
new file mode 100644
index 00000000000..1c25521dacd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_getopt.c
@@ -0,0 +1,150 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* $NetBSD: getopt.c,v 1.26 2003/08/07 16:43:40 agc Exp $ */
+
+/*
+ * Copyright (c) 1987, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "wt_internal.h"
+
+extern int __wt_opterr, __wt_optind, __wt_optopt, __wt_optreset;
+int __wt_opterr = 1, /* if error message should be printed */
+ __wt_optind = 1, /* index into parent argv vector */
+ __wt_optopt, /* character checked for validity */
+ __wt_optreset; /* reset getopt */
+
+extern char *__wt_optarg;
+char *__wt_optarg; /* argument associated with option */
+
+#define BADCH (int)'?'
+#define BADARG (int)':'
+#define EMSG ""
+
+/*
+ * __wt_getopt --
+ * Parse argc/argv argument vector.
+ */
+int
+__wt_getopt(
+ const char *progname, int nargc, char * const *nargv, const char *ostr)
+{
+ static const char *place = EMSG; /* option letter processing */
+ const char *oli; /* option letter list index */
+
+ if (__wt_optreset || *place == 0) { /* update scanning pointer */
+ __wt_optreset = 0;
+ place = nargv[__wt_optind];
+ if (__wt_optind >= nargc || *place++ != '-') {
+ /* Argument is absent or is not an option */
+ place = EMSG;
+ return (-1);
+ }
+ __wt_optopt = *place++;
+ if (__wt_optopt == '-' && *place == 0) {
+ /* "--" => end of options */
+ ++__wt_optind;
+ place = EMSG;
+ return (-1);
+ }
+ if (__wt_optopt == 0) {
+ /* Solitary '-', treat as a '-' option
+ if the program (eg su) is looking for it. */
+ place = EMSG;
+ if (strchr(ostr, '-') == NULL)
+ return (-1);
+ __wt_optopt = '-';
+ }
+ } else
+ __wt_optopt = *place++;
+
+ /* See if option letter is one the caller wanted... */
+ if (__wt_optopt == ':' || (oli = strchr(ostr, __wt_optopt)) == NULL) {
+ if (*place == 0)
+ ++__wt_optind;
+ if (__wt_opterr && *ostr != ':')
+ (void)fprintf(stderr,
+ "%s: illegal option -- %c\n", progname,
+ __wt_optopt);
+ return (BADCH);
+ }
+
+ /* Does this option need an argument? */
+ if (oli[1] != ':') {
+ /* don't need argument */
+ __wt_optarg = NULL;
+ if (*place == 0)
+ ++__wt_optind;
+ } else {
+ /* Option-argument is either the rest of this argument or the
+ entire next argument. */
+ if (*place)
+ __wt_optarg = (char *)place;
+ else if (nargc > ++__wt_optind)
+ __wt_optarg = nargv[__wt_optind];
+ else {
+ /* option-argument absent */
+ place = EMSG;
+ if (*ostr == ':')
+ return (BADARG);
+ if (__wt_opterr)
+ (void)fprintf(stderr,
+ "%s: option requires an argument -- %c\n",
+ progname, __wt_optopt);
+ return (BADCH);
+ }
+ place = EMSG;
+ ++__wt_optind;
+ }
+ return (__wt_optopt); /* return option letter */
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_map.c b/src/third_party/wiredtiger/src/os_posix/os_map.c
new file mode 100644
index 00000000000..be4d27e96a3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_map.c
@@ -0,0 +1,136 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mmap --
+ * Map a file into memory.
+ */
+int
+__wt_mmap(WT_SESSION_IMPL *session,
+ WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie)
+{
+ void *map;
+ size_t orig_size;
+
+ WT_UNUSED(mappingcookie);
+
+ /*
+ * Record the current size, map only that much, and return that size as
+ * the length: the file size could change between the map call and when
+ * we set the return length.
+ * For the same reason we could actually map past the end of the file;
+ * we don't read bytes past the end of the file though, so as long as
+ * the map call succeeds, it's all OK.
+ */
+ orig_size = (size_t)fh->size;
+ if ((map = mmap(NULL, orig_size,
+ PROT_READ,
+#ifdef MAP_NOCORE
+ MAP_NOCORE |
+#endif
+ MAP_PRIVATE,
+ fh->fd, (wt_off_t)0)) == MAP_FAILED) {
+ WT_RET_MSG(session, __wt_errno(),
+ "%s map error: failed to map %" WT_SIZET_FMT " bytes",
+ fh->name, orig_size);
+ }
+ (void)__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: map %p: %" WT_SIZET_FMT " bytes", fh->name, map, orig_size);
+
+ *(void **)mapp = map;
+ *lenp = orig_size;
+ return (0);
+}
+
+#define WT_VM_PAGESIZE 4096
+
+/*
+ * __wt_mmap_preload --
+ * Cause a section of a memory map to be faulted in.
+ */
+int
+__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
+{
+#ifdef HAVE_POSIX_MADVISE
+ /* Linux requires the address be aligned to a 4KB boundary. */
+ WT_BM *bm = S2BT(session)->bm;
+ WT_DECL_RET;
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ size += WT_PTRDIFF(p, blk);
+
+ /* XXX proxy for "am I doing a scan?" -- manual read-ahead */
+ if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
+ /* Read in 2MB blocks every 1MB of data. */
+ if (((uintptr_t)((uint8_t *)blk + size) &
+ (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
+ return (0);
+ size = WT_MIN(WT_MAX(20 * size, 2 << 20),
+ WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk));
+ }
+
+ /*
+ * Manual pages aren't clear on whether alignment is required for the
+ * size, so we will be conservative.
+ */
+ size &= ~(size_t)(WT_VM_PAGESIZE - 1);
+
+ if (size > WT_VM_PAGESIZE &&
+ (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
+ WT_RET_MSG(session, ret, "posix_madvise will need");
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+#endif
+
+ return (0);
+}
+
+/*
+ * __wt_mmap_discard --
+ * Discard a chunk of the memory map.
+ */
+int
+__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
+{
+#ifdef HAVE_POSIX_MADVISE
+ /* Linux requires the address be aligned to a 4KB boundary. */
+ WT_DECL_RET;
+ void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
+ size += WT_PTRDIFF(p, blk);
+
+ if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) != 0)
+ WT_RET_MSG(session, ret, "posix_madvise don't need");
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+#endif
+ return (0);
+}
+
+/*
+ * __wt_munmap --
+ * Remove a memory mapping.
+ */
+int
+__wt_munmap(WT_SESSION_IMPL *session,
+ WT_FH *fh, void *map, size_t len, void **mappingcookie)
+{
+ WT_UNUSED(mappingcookie);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: unmap %p: %" WT_SIZET_FMT " bytes", fh->name, map, len));
+
+ if (munmap(map, len) == 0)
+ return (0);
+
+ WT_RET_MSG(session, __wt_errno(),
+ "%s unmap error: failed to unmap %" WT_SIZET_FMT " bytes",
+ fh->name, len);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
new file mode 100644
index 00000000000..3a76cceb3f0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_cond.c
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cond_alloc --
+ * Allocate and initialize a condition variable.
+ */
+int
+__wt_cond_alloc(WT_SESSION_IMPL *session,
+ const char *name, int is_signalled, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+ WT_DECL_RET;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));
+
+ WT_ERR(pthread_mutex_init(&cond->mtx, NULL));
+
+ /* Initialize the condition variable to permit self-blocking. */
+ WT_ERR(pthread_cond_init(&cond->cond, NULL));
+
+ cond->name = name;
+ cond->waiters = is_signalled ? -1 : 0;
+
+ *condp = cond;
+ return (0);
+
+err: __wt_free(session, cond);
+ return (ret);
+}
+
+/*
+ * __wt_cond_wait --
+ * Wait on a mutex, optionally timing out.
+ */
+int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
+{
+ struct timespec ts;
+ WT_DECL_RET;
+ int locked;
+
+ locked = 0;
+ WT_ASSERT(session, usecs >= 0);
+
+ /*
+ * Fast path if already signalled: a waiters count of -1 means
+ * signalled with no waiters, so incrementing it to 0 consumes the
+ * pending wakeup without blocking.
+ */
+ if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ return (0);
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "wait %s cond (%p)", cond->name, cond));
+ WT_STAT_FAST_CONN_INCR(session, cond_wait);
+ }
+
+ WT_ERR(pthread_mutex_lock(&cond->mtx));
+ locked = 1;
+
+ if (usecs > 0) {
+ WT_ERR(__wt_epoch(session, &ts));
+ ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / WT_BILLION;
+ ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % WT_BILLION;
+ ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts);
+ } else
+ ret = pthread_cond_wait(&cond->cond, &cond->mtx);
+
+ /*
+ * Check pthread_cond_wait() return for EINTR, ETIME and
+ * ETIMEDOUT, some systems return these errors.
+ */
+ if (ret == EINTR ||
+#ifdef ETIME
+ ret == ETIME ||
+#endif
+ ret == ETIMEDOUT)
+ ret = 0;
+
+ (void)WT_ATOMIC_SUB4(cond->waiters, 1);
+
+err: if (locked)
+ WT_TRET(pthread_mutex_unlock(&cond->mtx));
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "pthread_cond_wait");
+}
+
+/*
+ * __wt_cond_signal --
+ * Signal a waiting thread.
+ */
+int
+__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+{
+ WT_DECL_RET;
+ int locked;
+
+ locked = 0;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL)
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "signal %s cond (%p)", cond->name, cond));
+
+ /* Fast path if already signalled. */
+ if (cond->waiters == -1)
+ return (0);
+
+ if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ WT_ERR(pthread_mutex_lock(&cond->mtx));
+ locked = 1;
+ WT_ERR(pthread_cond_broadcast(&cond->cond));
+ }
+
+err: if (locked)
+ WT_TRET(pthread_mutex_unlock(&cond->mtx));
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "pthread_cond_broadcast");
+}
+
+/*
+ * __wt_cond_destroy --
+ * Destroy a condition variable.
+ */
+int
+__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+ WT_DECL_RET;
+
+ cond = *condp;
+ if (cond == NULL)
+ return (0);
+
+ ret = pthread_cond_destroy(&cond->cond);
+ WT_TRET(pthread_mutex_destroy(&cond->mtx));
+ __wt_free(session, *condp);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
new file mode 100644
index 00000000000..1a692f71dce
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
@@ -0,0 +1,227 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
+ * http://locklessinc.com/articles/locks/
+ *
+ * Dr. Fuerst further credits:
+ * There exists a form of the ticket lock that is designed for read-write
+ * locks. An example written in assembly was posted to the Linux kernel mailing
+ * list in 2002 by David Howells from RedHat. This was a highly optimized
+ * version of a read-write ticket lock developed at IBM in the early 90's by
+ * Joseph Seigh. Note that a similar (but not identical) algorithm was published
+ * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
+ * Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rwlock_alloc --
+ * Allocate and initialize a read/write lock.
+ */
+int
+__wt_rwlock_alloc(
+ WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
+{
+ WT_RWLOCK *rwlock;
+
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));
+
+ WT_RET(__wt_calloc_def(session, 1, &rwlock));
+
+ rwlock->name = name;
+
+ *rwlockp = rwlock;
+ return (0);
+}
+
+/*
+ * __wt_try_readlock --
+ * Try to get a shared lock, fail immediately if unavailable.
+ */
+int
+__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t old, new, pad, users, writers;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+ l = &rwlock->rwlock;
+ pad = l->s.pad;
+ users = l->s.users;
+ writers = l->s.writers;
+ old = (pad << 48) + (users << 32) + (users << 16) + writers;
+ new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
+ return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY);
+}
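+
+/*
+ * A sketch of the 64-bit word the arithmetic above packs and compares,
+ * with one 16-bit ticket field per position:
+ *
+ *     63 ...... 48 47 ...... 32 31 ...... 16 15 ....... 0
+ *         pad          users        readers      writers
+ *
+ * A reader can take a ticket and grant it in one compare-and-swap only
+ * when readers equals users, that is, when no earlier ticket is still
+ * waiting to be served.
+ */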
+
+/*
+ * __wt_readlock --
+ * Get a shared lock.
+ */
+int
+__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t me;
+ uint16_t val;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+ l = &rwlock->rwlock;
+ me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
+ val = (uint16_t)(me >> 32);
+ while (val != l->s.readers)
+ WT_PAUSE();
+
+ ++l->s.readers;
+
+ return (0);
+}
+
+/*
+ * __wt_readunlock --
+ * Release a shared lock.
+ */
+int
+__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
+
+ l = &rwlock->rwlock;
+ WT_ATOMIC_ADD2(l->s.writers, 1);
+
+ return (0);
+}
+
+/*
+ * __wt_try_writelock --
+ * Try to get an exclusive lock, fail immediately if unavailable.
+ */
+int
+__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t old, new, pad, readers, users;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ l = &rwlock->rwlock;
+ pad = l->s.pad;
+ readers = l->s.readers;
+ users = l->s.users;
+ old = (pad << 48) + (users << 32) + (readers << 16) + users;
+ new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
+ return (WT_ATOMIC_CAS_VAL8(l->u, old, new) == old ? 0 : EBUSY);
+}
+
+/*
+ * __wt_writelock --
+ * Wait to get an exclusive lock.
+ */
+int
+__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+ uint64_t me;
+ uint16_t val;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the count
+ * of writers will wrap and two lockers will simultaneously be granted
+ * the write lock.
+ */
+ l = &rwlock->rwlock;
+ me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
+ val = (uint16_t)(me >> 32);
+ while (val != l->s.writers)
+ WT_PAUSE();
+
+ return (0);
+}
+
+/*
+ * __wt_writeunlock --
+ * Release an exclusive lock.
+ */
+int
+__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l, copy;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name));
+
+ l = &rwlock->rwlock;
+
+ copy = *l;
+
+ WT_BARRIER();
+
+ ++copy.s.writers;
+ ++copy.s.readers;
+
+ l->us = copy.us;
+ return (0);
+}
+
+/*
+ * __wt_rwlock_destroy --
+ * Destroy a read/write lock.
+ */
+int
+__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
+{
+ WT_RWLOCK *rwlock;
+
+ rwlock = *rwlockp; /* Clear our caller's reference. */
+ if (rwlock == NULL)
+ return (0);
+ *rwlockp = NULL;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name));
+
+ __wt_free(session, rwlock);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_once.c b/src/third_party/wiredtiger/src/os_posix/os_once.c
new file mode 100644
index 00000000000..22eaf5f0ee5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_once.c
@@ -0,0 +1,20 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_once --
+ * One-time initialization per process.
+ */
+int
+__wt_once(void (*init_routine)(void))
+{
+ static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+
+ return (pthread_once(&once_control, init_routine));
+}
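+
+/*
+ * A minimal sketch (the init routine is illustrative): all callers share
+ * the same pthread_once control, so the routine runs at most once per
+ * process no matter how many threads race here.
+ *
+ *     static void process_init(void);
+ *     WT_RET(__wt_once(process_init));
+ */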
diff --git a/src/third_party/wiredtiger/src/os_posix/os_open.c b/src/third_party/wiredtiger/src/os_posix/os_open.c
new file mode 100644
index 00000000000..a1bc3feb7d2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_open.c
@@ -0,0 +1,253 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __open_directory_sync --
+ * Fsync the directory in which we created the file.
+ */
+static int
+__open_directory_sync(WT_SESSION_IMPL *session, char *path)
+{
+#ifdef __linux__
+ WT_DECL_RET;
+ int fd, tret;
+ char *dir;
+
+ /*
+ * According to the Linux fsync man page:
+ * Calling fsync() does not necessarily ensure that the entry in
+ * the directory containing the file has also reached disk. For
+ * that an explicit fsync() on a file descriptor for the directory
+ * is also needed.
+ *
+ * Open the WiredTiger home directory and sync it; the rest of the
+ * system shouldn't have to wonder whether opening a file durably
+ * created it.
+ */
+ if ((dir = strrchr(path, '/')) == NULL)
+ path = (char *)".";
+ else
+ *dir = '\0';
+ WT_SYSCALL_RETRY(((fd =
+ open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret);
+ if (dir != NULL)
+ *dir = '/';
+ if (ret != 0)
+ WT_RET_MSG(session, ret, "%s: open", path);
+
+ WT_SYSCALL_RETRY(fsync(fd), ret);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret, "%s: fsync", path);
+
+err: WT_SYSCALL_RETRY(close(fd), tret);
+ if (tret != 0) {
+ __wt_err(session, tret, "%s: close", path);
+ WT_TRET(tret);
+ }
+ return (ret);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(path);
+ return (0);
+#endif
+}
+
+/*
+ * __wt_open --
+ * Open a file handle.
+ */
+int
+__wt_open(WT_SESSION_IMPL *session,
+ const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *fh, *tfh;
+ mode_t mode;
+ int direct_io, f, fd, matched;
+ char *path;
+
+ conn = S2C(session);
+ fh = NULL;
+ fd = -1;
+ path = NULL;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));
+
+ /* Increment the reference count if we already have the file open. */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched)
+ return (0);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ f = O_RDWR;
+#ifdef O_BINARY
+ /* Windows clones: we always want to treat the file as a binary. */
+ f |= O_BINARY;
+#endif
+#ifdef O_CLOEXEC
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles.
+ */
+ f |= O_CLOEXEC;
+#endif
+#ifdef O_NOATIME
+ /* Avoid updating metadata for read-only workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ f |= O_NOATIME;
+#endif
+
+ if (ok_create) {
+ f |= O_CREAT;
+ if (exclusive)
+ f |= O_EXCL;
+ mode = 0666;
+ } else
+ mode = 0;
+
+ direct_io = 0;
+#ifdef O_DIRECT
+ if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
+ f |= O_DIRECT;
+ direct_io = 1;
+ }
+#endif
+ if (dio_type == WT_FILE_TYPE_LOG &&
+ FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC))
+#ifdef O_DSYNC
+ f |= O_DSYNC;
+#elif defined(O_SYNC)
+ f |= O_SYNC;
+#else
+ WT_ERR_MSG(session, ENOTSUP,
+ "Unsupported log sync mode requested");
+#endif
+ WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret);
+ if (ret != 0)
+ WT_ERR_MSG(session, ret,
+ direct_io ?
+ "%s: open failed with direct I/O configured, some "
+ "filesystem types do not support direct I/O" : "%s", path);
+
+#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles. There's an obvious
+ * race here, so we prefer the flag to open if available.
+ */
+ if ((f = fcntl(fd, F_GETFD)) == -1 ||
+ fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1)
+ WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name);
+#endif
+
+#if defined(HAVE_POSIX_FADVISE)
+ /* Disable read-ahead on trees: it slows down random read workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM));
+#endif
+
+ if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
+ WT_ERR(__open_directory_sync(session, path));
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_strdup(session, name, &fh->name));
+ fh->fd = fd;
+ fh->ref = 1;
+ fh->direct_io = direct_io;
+
+ /* Set the file's size. */
+ WT_ERR(__wt_filesize(session, fh, &fh->size));
+
+ /* Configure file extension. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ fh->extend_len = conn->data_extend_len;
+
+ /* Configure fallocate/posix_fallocate calls. */
+ __wt_fallocate_config(session, fh);
+
+ /*
+ * Repeat the check for a match, but then link onto the database's list
+ * of files.
+ */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ if (!matched) {
+ TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_INCR(session, file_open);
+
+ *fhp = fh;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched) {
+err: if (fh != NULL) {
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ }
+ if (fd != -1)
+ (void)close(fd);
+ }
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_close --
+ * Close a file handle.
+ */
+int
+__wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ __wt_spin_lock(session, &conn->fh_lock);
+ if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
+ __wt_spin_unlock(session, &conn->fh_lock);
+ return (0);
+ }
+
+ /* Remove from the list. */
+ TAILQ_REMOVE(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_DECR(session, file_open);
+
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ /* Discard the memory. */
+ if (close(fh->fd) != 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "close: %s", fh->name);
+ }
+
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ return (ret);
+}
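
A sketch of the reference-counting contract implied by the handle queue above (the file name and wrapper are hypothetical): a second open of the same name returns the cached WT_FH with its count bumped, and every open must be balanced by a close.

	static int
	open_twice(WT_SESSION_IMPL *session)
	{
		WT_FH *fh1, *fh2;

		WT_RET(__wt_open(
		    session, "test.wt", 1, 0, WT_FILE_TYPE_DATA, &fh1));
		WT_RET(__wt_open(
		    session, "test.wt", 0, 0, WT_FILE_TYPE_DATA, &fh2));

		/* The second open returned the cached handle: fh1 == fh2. */
		WT_RET(__wt_close(session, fh2));	/* ref 2 -> 1 */
		return (__wt_close(session, fh1));	/* handle discarded */
	}
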
diff --git a/src/third_party/wiredtiger/src/os_posix/os_path.c b/src/third_party/wiredtiger/src/os_posix/os_path.c
new file mode 100644
index 00000000000..aed99d1d027
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_path.c
@@ -0,0 +1,28 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_absolute_path --
+ * Return if a filename is an absolute path.
+ */
+int
+__wt_absolute_path(const char *path)
+{
+ return (path[0] == '/' ? 1 : 0);
+}
+
+/*
+ * __wt_path_separator --
+ * Return the path separator string.
+ */
+const char *
+__wt_path_separator(void)
+{
+ return ("/");
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_priv.c b/src/third_party/wiredtiger/src/os_posix/os_priv.c
new file mode 100644
index 00000000000..7d56359da4f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_priv.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_has_priv --
+ * Return if the process has special privileges, defined as having
+ *	different effective and real UIDs or GIDs.
+ */
+int
+__wt_has_priv(void)
+{
+ return (getuid() != geteuid() || getgid() != getegid());
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_remove.c b/src/third_party/wiredtiger/src/os_posix/os_remove.c
new file mode 100644
index 00000000000..a52a4db6bc7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_remove.c
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __remove_file_check --
+ * Check if the file is currently open before removing it.
+ */
+static void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
+{
+#ifdef HAVE_DIAGNOSTIC
+ WT_CONNECTION_IMPL *conn;
+ WT_FH *fh;
+
+ conn = S2C(session);
+ fh = NULL;
+
+ /*
+ * Check if the file is open: it's an error if it is, since a higher
+ * level should have closed it before removing.
+ */
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
+ if (strcmp(name, fh->name) == 0)
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ WT_ASSERT(session, fh == NULL);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+#endif
+}
+
+/*
+ * __wt_remove --
+ * Remove a file.
+ */
+int
+__wt_remove(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name));
+
+ __remove_file_check(session, name);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ WT_SYSCALL_RETRY(remove(path), ret);
+
+ __wt_free(session, path);
+
+ if (ret == 0 || ret == ENOENT)
+ return (0);
+
+ WT_RET_MSG(session, ret, "%s: remove", name);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_rename.c b/src/third_party/wiredtiger/src/os_posix/os_rename.c
new file mode 100644
index 00000000000..ddbb59aaf37
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_rename.c
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rename --
+ * Rename a file.
+ */
+int
+__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+ WT_DECL_RET;
+ char *from_path, *to_path;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "rename %s to %s", from, to));
+
+ from_path = to_path = NULL;
+
+ WT_RET(__wt_filename(session, from, &from_path));
+ WT_TRET(__wt_filename(session, to, &to_path));
+
+ if (ret == 0)
+ WT_SYSCALL_RETRY(rename(from_path, to_path), ret);
+
+ __wt_free(session, from_path);
+ __wt_free(session, to_path);
+
+ if (ret == 0)
+ return (0);
+
+ WT_RET_MSG(session, ret, "rename %s to %s", from, to);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_rw.c b/src/third_party/wiredtiger/src/os_posix/os_rw.c
new file mode 100644
index 00000000000..4247fb30fd1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_rw.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ * Read a chunk.
+ */
+int
+__wt_read(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+{
+ size_t chunk;
+ ssize_t nr;
+ uint8_t *addr;
+
+ WT_STAT_FAST_CONN_INCR(session, read_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break reads larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
+ chunk = WT_MIN(len, WT_GIGABYTE);
+ if ((nr = pread(fh->fd, addr, chunk, offset)) <= 0)
+ WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(),
+ "%s read error: failed to read %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
+
+/*
+ * __wt_write --
+ * Write a chunk.
+ */
+int
+__wt_write(WT_SESSION_IMPL *session,
+ WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+{
+ size_t chunk;
+ ssize_t nw;
+ const uint8_t *addr;
+
+ WT_STAT_FAST_CONN_INCR(session, write_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break writes larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
+ chunk = WT_MIN(len, WT_GIGABYTE);
+ if ((nw = pwrite(fh->fd, addr, chunk, offset)) < 0)
+ WT_RET_MSG(session, __wt_errno(),
+ "%s write error: failed to write %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
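
The direct I/O assertion in both functions encodes the same predicate; here is a standalone restatement (a hypothetical helper, assuming the configured alignment is a power of two, as WiredTiger requires):

	static int
	aligned_for_direct_io(const void *buf, size_t len, size_t align)
	{
		/* No configured alignment means no constraint. */
		if (align == 0)
			return (1);

		/* Aligned address, and a length that's a multiple of it. */
		return (((uintptr_t)buf & (align - 1)) == 0 &&
		    len >= align && len % align == 0);
	}
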
diff --git a/src/third_party/wiredtiger/src/os_posix/os_sleep.c b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
new file mode 100644
index 00000000000..665330a26e7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_sleep.c
@@ -0,0 +1,23 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ * Pause the thread of control.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+ struct timeval t;
+
+ t.tv_sec = seconds + micro_seconds / 1000000;
+ t.tv_usec = (suseconds_t)(micro_seconds % 1000000);
+
+ (void)select(0, NULL, NULL, NULL, &t);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_strtouq.c b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c
new file mode 100644
index 00000000000..97f9759f76f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_strtouq.c
@@ -0,0 +1,24 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_strtouq --
+ * Convert a string to an unsigned quad integer.
+ */
+uint64_t
+__wt_strtouq(const char *nptr, char **endptr, int base)
+{
+#if defined(HAVE_STRTOUQ)
+ return (strtouq(nptr, endptr, base));
+#else
+ WT_STATIC_ASSERT(sizeof(uint64_t) == sizeof(unsigned long long));
+
+ return (strtoull(nptr, endptr, base));
+#endif
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_thread.c b/src/third_party/wiredtiger/src/os_posix/os_thread.c
new file mode 100644
index 00000000000..7c447710b46
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_thread.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ * Create a new thread of control.
+ */
+int
+__wt_thread_create(WT_SESSION_IMPL *session,
+ wt_thread_t *tidret, void *(*func)(void *), void *arg)
+{
+ WT_DECL_RET;
+
+ /* Spawn a new thread of control. */
+ if ((ret = pthread_create(tidret, NULL, func, arg)) == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "pthread_create");
+}
+
+/*
+ * __wt_thread_join --
+ * Wait for a thread of control to exit.
+ */
+int
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+{
+ WT_DECL_RET;
+
+ if ((ret = pthread_join(tid, NULL)) == 0)
+ return (0);
+
+ WT_RET_MSG(session, ret, "pthread_join");
+}
+
+/*
+ * __wt_thread_id --
+ * Fill in a printable version of the process and thread IDs.
+ */
+void
+__wt_thread_id(char *buf, size_t buflen)
+{
+ pthread_t self;
+
+ /*
+ * POSIX 1003.1 allows pthread_t to be an opaque type, but on systems
+ * where it's a pointer, we'd rather print out the pointer and match
+ * gdb output. Since we don't yet run on any systems where pthread_t
+ * is not a pointer, do it that way for now.
+ */
+ self = pthread_self();
+ (void)snprintf(buf, buflen,
+ "%" PRIu64 ":%p", (uint64_t)getpid(), (void *)self);
+}
diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c
new file mode 100644
index 00000000000..56f688a1e14
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_time.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ WT_RET(__wt_epoch(session, &t));
+
+ *timep = t.tv_sec;
+
+ return (0);
+}
+
+/*
+ * __wt_epoch --
+ * Return the time since the Epoch.
+ */
+int
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ WT_DECL_RET;
+
+#if defined(HAVE_CLOCK_GETTIME)
+ WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret);
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "clock_gettime");
+#elif defined(HAVE_GETTIMEOFDAY)
+ struct timeval v;
+
+ WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret);
+ if (ret == 0) {
+ tsp->tv_sec = v.tv_sec;
+ tsp->tv_nsec = v.tv_usec * 1000;
+ return (0);
+ }
+ WT_RET_MSG(session, ret, "gettimeofday");
+#else
+ NO TIME-OF-DAY IMPLEMENTATION: see src/os_posix/os_time.c
+#endif
+}
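
A small interval-timing sketch built on __wt_epoch (the helper is hypothetical); the subtraction is done in signed nanoseconds so a smaller tv_nsec in the later sample can't underflow:

	static int
	elapsed_ms(WT_SESSION_IMPL *session,
	    const struct timespec *start, uint64_t *msp)
	{
		struct timespec stop;
		int64_t ns;

		WT_RET(__wt_epoch(session, &stop));

		ns = (int64_t)(stop.tv_sec - start->tv_sec) * 1000000000 +
		    (stop.tv_nsec - start->tv_nsec);
		*msp = (uint64_t)(ns / 1000000);
		return (0);
	}
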
diff --git a/src/third_party/wiredtiger/src/os_posix/os_yield.c b/src/third_party/wiredtiger/src/os_posix/os_yield.c
new file mode 100644
index 00000000000..6af30803e81
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_posix/os_yield.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ * Yield the thread of control.
+ */
+void
+__wt_yield(void)
+{
+ sched_yield();
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_dir.c b/src/third_party/wiredtiger/src/os_win/os_dir.c
new file mode 100644
index 00000000000..076c64670d4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_dir.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dirlist --
+ * Get a list of files from a directory, optionally filtered by
+ * a given prefix.
+ */
+int
+__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
+ uint32_t flags, char ***dirlist, u_int *countp)
+{
+ HANDLE findhandle;
+ WIN32_FIND_DATA finddata;
+ WT_DECL_ITEM(pathbuf);
+ WT_DECL_RET;
+ size_t dirallocsz, pathlen;
+ u_int count, dirsz;
+ int match;
+ char **entries, *path;
+
+ *dirlist = NULL;
+ *countp = 0;
+
+ findhandle = INVALID_HANDLE_VALUE;
+ count = 0;
+
+ WT_RET(__wt_filename(session, dir, &path));
+
+ pathlen = strlen(path);
+	if (path[pathlen - 1] == '\\')
+		path[pathlen - 1] = '\0';
+
+ WT_ERR(__wt_scr_alloc(session, 0, &pathbuf));
+ WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", path));
+
+ dirallocsz = 0;
+ dirsz = 0;
+ entries = NULL;
+ if (flags == 0)
+ LF_SET(WT_DIRLIST_INCLUDE);
+
+ WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
+ "wt_dirlist of %s %s prefix %s",
+ pathbuf->data, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
+ prefix == NULL ? "all" : prefix));
+
+ findhandle = FindFirstFileA(pathbuf->data, &finddata);
+
+ if (INVALID_HANDLE_VALUE == findhandle)
+ WT_ERR_MSG(session, __wt_errno(), "%s: FindFirstFile",
+ pathbuf->data);
+ else {
+ do {
+ /*
+ * Skip . and ..
+ */
+ if (strcmp(finddata.cFileName, ".") == 0 ||
+ strcmp(finddata.cFileName, "..") == 0)
+ continue;
+ match = 0;
+ if (prefix != NULL &&
+ ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
+ WT_PREFIX_MATCH(finddata.cFileName, prefix)) ||
+ (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
+ !WT_PREFIX_MATCH(finddata.cFileName, prefix))))
+ match = 1;
+ if (prefix == NULL || match) {
+ /*
+ * We have a file name we want to return.
+ */
+ count++;
+ if (count > dirsz) {
+ dirsz += WT_DIR_ENTRY;
+ WT_ERR(__wt_realloc_def(session,
+ &dirallocsz, dirsz, &entries));
+ }
+ WT_ERR(__wt_strdup(session,
+ finddata.cFileName, &entries[count - 1]));
+ }
+ } while (FindNextFileA(findhandle, &finddata) != 0);
+ }
+
+ if (count > 0)
+ *dirlist = entries;
+ *countp = count;
+
+err:
+ if (findhandle != INVALID_HANDLE_VALUE)
+ (void)FindClose(findhandle);
+ __wt_free(session, path);
+ __wt_buf_free(session, pathbuf);
+
+ if (ret == 0)
+ return (0);
+
+	if (entries != NULL) {
+		for (count = dirsz; count > 0; count--)
+			__wt_free(session, entries[count - 1]);
+		__wt_free(session, entries);
+	}
+
+ WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix);
+}
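
A hypothetical caller showing the ownership rules for the list returned above: the caller frees each name and then the array itself.

	static int
	list_wt_files(WT_SESSION_IMPL *session, const char *dir)
	{
		u_int count, i;
		char **files;

		WT_RET(__wt_dirlist(session,
		    dir, "WiredTiger", WT_DIRLIST_INCLUDE, &files, &count));

		/* ... use files[0] through files[count - 1] ... */

		for (i = 0; i < count; i++)
			__wt_free(session, files[i]);
		__wt_free(session, files);
		return (0);
	}
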
diff --git a/src/third_party/wiredtiger/src/os_win/os_dlopen.c b/src/third_party/wiredtiger/src/os_win/os_dlopen.c
new file mode 100644
index 00000000000..ebc90edd2b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_dlopen.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_dlopen --
+ * Open a dynamic library.
+ */
+int
+__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
+{
+ WT_DECL_RET;
+ WT_DLH *dlh;
+
+ WT_RET(__wt_calloc_def(session, 1, &dlh));
+ WT_ERR(__wt_strdup(session, path, &dlh->name));
+
+	/* A NULL path means load from the current binary. */
+	if (path == NULL) {
+		ret = GetModuleHandleExA(0, NULL, &dlh->handle);
+		if (ret == FALSE)
+			WT_ERR_MSG(session,
+			    __wt_errno(), "GetModuleHandleEx");
+	} else {
+		/* TODO: load the DLL here. */
+		DebugBreak();
+	}
+
+ /* Windows returns 0 on failure, WT expects 0 on success */
+ ret = !ret;
+
+ *dlhp = dlh;
+ if (0) {
+err: __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_dlsym --
+ * Lookup a symbol in a dynamic library.
+ */
+int
+__wt_dlsym(WT_SESSION_IMPL *session,
+ WT_DLH *dlh, const char *name, int fail, void *sym_ret)
+{
+ void *sym;
+
+ *(void **)sym_ret = NULL;
+
+ sym = GetProcAddress(dlh->handle, name);
+	if (sym == NULL && fail)
+		WT_RET_MSG(session, __wt_errno(),
+		    "GetProcAddress(%s in %s)", name, dlh->name);
+
+ *(void **)sym_ret = sym;
+ return (0);
+}
+
+/*
+ * __wt_dlclose --
+ * Close a dynamic library
+ */
+int
+__wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh)
+{
+ WT_DECL_RET;
+
+ if ((ret = FreeLibrary(dlh->handle)) == FALSE) {
+ __wt_err(session, __wt_errno(), "FreeLibrary");
+ }
+
+ /* Windows returns 0 on failure, WT expects 0 on success */
+ ret = !ret;
+
+ __wt_free(session, dlh->name);
+ __wt_free(session, dlh);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_errno.c b/src/third_party/wiredtiger/src/os_win/os_errno.c
new file mode 100644
index 00000000000..ce50106b0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_errno.c
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_errno --
+ * Return errno, or WT_ERROR if errno not set.
+ */
+int
+__wt_errno(void)
+{
+ /*
+ * Called when we know an error occurred, and we want the system
+ * error code, but there's some chance it's not set.
+ */
+ DWORD err = GetLastError();
+
+ /* GetLastError should only be called if we hit an actual error */
+ WT_ASSERT(NULL, err != ERROR_SUCCESS);
+
+ return (err == ERROR_SUCCESS ? WT_ERROR : err);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_exist.c b/src/third_party/wiredtiger/src/os_win/os_exist.c
new file mode 100644
index 00000000000..ab3805f19df
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_exist.c
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_exist --
+ * Return if the file exists.
+ */
+int
+__wt_exist(WT_SESSION_IMPL *session, const char *filename, int *existp)
+{
+ WT_DECL_RET;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ ret = GetFileAttributesA(path);
+
+ __wt_free(session, path);
+
+ if (ret != INVALID_FILE_ATTRIBUTES)
+ *existp = 1;
+ else
+ *existp = 0;
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_fallocate.c b/src/third_party/wiredtiger/src/os_win/os_fallocate.c
new file mode 100644
index 00000000000..bd71c780dc5
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_fallocate.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fallocate_config --
+ * Configure fallocate behavior for a file handle.
+ */
+void
+__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ fh->fallocate_available = 1;
+
+ /*
+ * We use a separate handle for file size changes, so there's no need
+ * for locking.
+ */
+ fh->fallocate_requires_locking = 0;
+}
+
+/*
+ * __wt_fallocate --
+ * Allocate space for a file handle.
+ */
+int
+__wt_fallocate(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+{
+ WT_DECL_RET;
+ LARGE_INTEGER largeint;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: fallocate", fh->name));
+
+ largeint.QuadPart = offset + len;
+
+ if ((ret = SetFilePointerEx(
+ fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE)
+ WT_RET_MSG(session,
+ __wt_errno(), "%s SetFilePointerEx error", fh->name);
+
+ if ((ret = SetEndOfFile(fh->filehandle_secondary)) != FALSE) {
+		fh->size = fh->extend_size = offset + len;
+ return (0);
+ }
+
+ WT_RET_MSG(session, __wt_errno(), "%s SetEndOfFile error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_filesize.c b/src/third_party/wiredtiger/src/os_win/os_filesize.c
new file mode 100644
index 00000000000..309ee1db40b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_filesize.c
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filesize --
+ * Get the size of a file in bytes.
+ */
+int
+__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+{
+ WT_DECL_RET;
+ LARGE_INTEGER size;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s: GetFileSizeEx", fh->name));
+
+ if ((ret = GetFileSizeEx(fh->filehandle, &size)) != 0) {
+ *sizep = size.QuadPart;
+ return (0);
+ }
+
+ WT_RET_MSG(session, __wt_errno(), "%s: GetFileSizeEx", fh->name);
+}
+
+/*
+ * __wt_filesize_name --
+ * Return the size of a file in bytes, given a file name.
+ */
+int
+__wt_filesize_name(
+ WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
+{
+ WT_DECL_RET;
+ WIN32_FILE_ATTRIBUTE_DATA data;
+ char *path;
+
+ WT_RET(__wt_filename(session, filename, &path));
+
+ ret = GetFileAttributesExA(path, GetFileExInfoStandard, &data);
+
+ __wt_free(session, path);
+
+ if (ret != 0) {
+ *sizep =
+ ((int64_t)data.nFileSizeHigh << 32) | data.nFileSizeLow;
+ return (0);
+ }
+
+ WT_RET_MSG(session, __wt_errno(), "%s: GetFileAttributesEx", filename);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_flock.c b/src/third_party/wiredtiger/src/os_win/os_flock.c
new file mode 100644
index 00000000000..4b3ca34d65f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_flock.c
@@ -0,0 +1,46 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bytelock --
+ * Lock/unlock a byte in a file.
+ */
+int
+__wt_bytelock(WT_FH *fhp, wt_off_t byte, int lock)
+{
+ WT_DECL_RET;
+
+ /*
+ * WiredTiger requires this function be able to acquire locks past
+ * the end of file.
+ *
+ * Note we're using fcntl(2) locking: all fcntl locks associated with a
+ * file for a given process are removed when any file descriptor for the
+ * file is closed by the process, even if a lock was never requested for
+ * that file descriptor.
+ *
+ * http://msdn.microsoft.com/
+ * en-us/library/windows/desktop/aa365202%28v=vs.85%29.aspx
+ *
+ * You can lock bytes that are beyond the end of the current file.
+ * This is useful to coordinate adding records to the end of a file.
+ */
+ if (lock) {
+ ret = LockFile(fhp->filehandle, UINT32_MAX & byte,
+ UINT32_MAX & (byte >> 32), 1, 0);
+ } else {
+ ret = UnlockFile(fhp->filehandle, UINT32_MAX & byte,
+ UINT32_MAX & (byte >> 32), 1, 0);
+ }
+
+ if (ret == FALSE)
+ WT_RET_MSG(NULL, __wt_errno(), "%s: LockFile", fhp->name);
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_fsync.c b/src/third_party/wiredtiger/src/os_win/os_fsync.c
new file mode 100644
index 00000000000..cd509131649
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_fsync.c
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_fsync --
+ * Flush a file handle.
+ */
+int
+__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_DECL_RET;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: FlushFileBuffers",
+ fh->name));
+
+ if ((ret = FlushFileBuffers(fh->filehandle)) == FALSE)
+ WT_RET_MSG(session,
+ __wt_errno(), "%s FlushFileBuffers error", fh->name);
+
+ return (0);
+}
+
+/*
+ * __wt_fsync_async --
+ * Flush a file handle and don't wait for the result.
+ */
+int
+__wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(fh);
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_ftruncate.c b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c
new file mode 100644
index 00000000000..5d87f1ce06a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_ftruncate.c
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ftruncate --
+ * Truncate a file.
+ */
+int
+__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+{
+ WT_DECL_RET;
+ LARGE_INTEGER largeint;
+ uint32_t lasterror;
+
+ largeint.QuadPart = len;
+
+ if ((ret = SetFilePointerEx(
+ fh->filehandle_secondary, largeint, NULL, FILE_BEGIN)) == FALSE)
+ WT_RET_MSG(session, __wt_errno(), "%s SetFilePointerEx error",
+ fh->name);
+
+ ret = SetEndOfFile(fh->filehandle_secondary);
+ if (ret != FALSE) {
+ fh->size = fh->extend_size = len;
+ return (0);
+ }
+
+ lasterror = GetLastError();
+
+	if (lasterror == ERROR_USER_MAPPED_FILE)
+ return (EBUSY);
+
+ WT_RET_MSG(session, lasterror, "%s SetEndOfFile error", fh->name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_map.c b/src/third_party/wiredtiger/src/os_win/os_map.c
new file mode 100644
index 00000000000..b3b4f0f7501
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_map.c
@@ -0,0 +1,106 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_mmap --
+ * Map a file into memory.
+ */
+int
+__wt_mmap(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp,
+ void** mappingcookie)
+{
+ void *map;
+ size_t orig_size;
+
+ /*
+	 * Record the current size and map only that many bytes, since the
+	 * size could change between the map call and when we set the return
+	 * length. For the same reason we could actually map past the end of
+	 * the file; we don't read bytes past the end of the file, though, so
+	 * as long as the map call succeeds, it's all OK.
+ */
+ orig_size = (size_t)fh->size;
+ *mappingcookie =
+ CreateFileMappingA(fh->filehandle, NULL, PAGE_READONLY, 0, 0, NULL);
+ if (*mappingcookie == NULL)
+ WT_RET_MSG(session, __wt_errno(),
+ "%s CreateFileMapping error: failed to map %"
+ WT_SIZET_FMT " bytes",
+ fh->name, orig_size);
+
+ if ((map = MapViewOfFile(
+ *mappingcookie, FILE_MAP_READ, 0, 0, orig_size)) == NULL) {
+ CloseHandle(*mappingcookie);
+ *mappingcookie = NULL;
+
+ WT_RET_MSG(session, __wt_errno(),
+ "%s map error: failed to map %" WT_SIZET_FMT " bytes",
+ fh->name, orig_size);
+ }
+ (void)__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: MapViewOfFile %p: %" WT_SIZET_FMT " bytes",
+ fh->name, map, orig_size);
+
+ *(void **)mapp = map;
+ *lenp = orig_size;
+ return (0);
+}
+
+/*
+ * __wt_mmap_preload --
+ * Cause a section of a memory map to be faulted in.
+ */
+int
+__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+
+ return (0);
+}
+
+/*
+ * __wt_mmap_discard --
+ * Discard a chunk of the memory map.
+ */
+int
+__wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size)
+{
+ WT_UNUSED(session);
+ WT_UNUSED(p);
+ WT_UNUSED(size);
+ return (0);
+}
+
+/*
+ * __wt_munmap --
+ * Remove a memory mapping.
+ */
+int
+__wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len,
+ void** mappingcookie)
+{
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: UnmapViewOfFile %p: %" WT_SIZET_FMT " bytes",
+ fh->name, map, len));
+
+ if (UnmapViewOfFile(map) == 0) {
+ WT_RET_MSG(session, __wt_errno(),
+ "%s UnmapViewOfFile error: failed to unmap %" WT_SIZET_FMT
+ " bytes",
+ fh->name, len);
+ }
+
+ CloseHandle(*mappingcookie);
+
+ *mappingcookie = 0;
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
new file mode 100644
index 00000000000..9c9907bd8be
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_cond.c
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_cond_alloc --
+ * Allocate and initialize a condition variable.
+ */
+int
+__wt_cond_alloc(WT_SESSION_IMPL *session,
+ const char *name, int is_signalled, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));
+
+ InitializeCriticalSection(&cond->mtx);
+
+ /* Initialize the condition variable to permit self-blocking. */
+ InitializeConditionVariable(&cond->cond);
+
+ cond->name = name;
+ cond->waiters = is_signalled ? -1 : 0;
+
+ *condp = cond;
+ return (0);
+}
+
+/*
+ * __wt_cond_wait --
+ * Wait on a mutex, optionally timing out.
+ */
+int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
+{
+ WT_DECL_RET;
+	int lasterror, locked, milliseconds;
+
+	locked = 0;
+	WT_ASSERT(session, usecs >= 0);
+
+ /* Fast path if already signalled. */
+ if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ return (0);
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL) {
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "wait %s cond (%p)", cond->name, cond));
+ WT_STAT_FAST_CONN_INCR(session, cond_wait);
+ }
+
+ EnterCriticalSection(&cond->mtx);
+ locked = 1;
+
+ if (usecs > 0) {
+ milliseconds = usecs / 1000;
+ /*
+		 * A timeout of zero would turn the wait into a non-blocking
+		 * check, which we do not want.
+ */
+ if (milliseconds == 0)
+ milliseconds = 1;
+ ret = SleepConditionVariableCS(
+ &cond->cond, &cond->mtx, milliseconds);
+ } else
+ ret = SleepConditionVariableCS(
+ &cond->cond, &cond->mtx, INFINITE);
+
+	/* SleepConditionVariableCS returns zero on failure or timeout. */
+	if (ret == 0) {
+		lasterror = GetLastError();
+		if (lasterror == ERROR_TIMEOUT)
+			ret = 1;	/* Timeouts are not errors here. */
+	}
+
+	(void)WT_ATOMIC_SUB4(cond->waiters, 1);
+
+	if (locked)
+		LeaveCriticalSection(&cond->mtx);
+	if (ret != 0)
+		return (0);
+	WT_RET_MSG(session, lasterror, "SleepConditionVariableCS");
+}
+
+/*
+ * __wt_cond_signal --
+ * Signal a waiting thread.
+ */
+int
+__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
+{
+ WT_DECL_RET;
+ int locked;
+
+ locked = 0;
+
+ /*
+ * !!!
+ * This function MUST handle a NULL session handle.
+ */
+ if (session != NULL)
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
+ "signal %s cond (%p)", cond->name, cond));
+
+ /* Fast path if already signalled. */
+ if (cond->waiters == -1)
+ return (0);
+
+ if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ EnterCriticalSection(&cond->mtx);
+ locked = 1;
+ WakeAllConditionVariable(&cond->cond);
+ }
+
+ if (locked)
+ LeaveCriticalSection(&cond->mtx);
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "WakeAllConditionVariable");
+}
+
+/*
+ * __wt_cond_destroy --
+ * Destroy a condition variable.
+ */
+int
+__wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp)
+{
+ WT_CONDVAR *cond;
+ WT_DECL_RET;
+
+ cond = *condp;
+ if (cond == NULL)
+ return (0);
+
+	/* Windows condition variables need no explicit destruction. */
+ DeleteCriticalSection(&cond->mtx);
+ __wt_free(session, *condp);
+
+ return (ret);
+}
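
A lifecycle sketch for this condition-variable API (the wrapper is hypothetical). It exercises the signalled fast path: a signal delivered with no waiters parks the counter at -1, so the next wait returns immediately without entering the critical section.

	static int
	cond_lifecycle(WT_SESSION_IMPL *session)
	{
		WT_CONDVAR *cond;

		WT_RET(__wt_cond_alloc(session, "example", 0, &cond));

		WT_RET(__wt_cond_signal(session, cond));	/* waiters -> -1 */
		WT_RET(__wt_cond_wait(session, cond, 0));	/* fast path */

		return (__wt_cond_destroy(session, &cond));
	}
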
diff --git a/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c
new file mode 100644
index 00000000000..ec0894a2f29
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_mtx_rw.c
@@ -0,0 +1,126 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rwlock_alloc --
+ * Allocate and initialize a read/write lock.
+ */
+int
+__wt_rwlock_alloc(
+ WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
+{
+ WT_RWLOCK *rwlock;
+
+ WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));
+
+ WT_RET(__wt_calloc_def(session, 1, &rwlock));
+
+ rwlock->name = name;
+ InitializeSRWLock(&rwlock->rwlock);
+
+ *rwlockp = rwlock;
+ return (0);
+}
+
+/*
+ * __wt_readlock --
+ * Get a shared lock.
+ */
+int
+__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: readlock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_read);
+
+ AcquireSRWLockShared(&rwlock->rwlock);
+
+ return (0);
+}
+
+/*
+ * __wt_readunlock --
+ * Release a shared lock.
+ */
+int
+__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
+
+ ReleaseSRWLockShared(&rwlock->rwlock);
+ return (0);
+}
+
+/*
+ * __wt_try_writelock --
+ * Try to get an exclusive lock, fail immediately if unavailable.
+ */
+int
+__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ return (TryAcquireSRWLockExclusive(&rwlock->rwlock) == 0 ? EBUSY : 0);
+}
+
+/*
+ * __wt_writelock --
+ * Wait to get an exclusive lock.
+ */
+int
+__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
+ WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+
+ AcquireSRWLockExclusive(&rwlock->rwlock);
+
+ return (0);
+}
+
+/*
+ * __wt_writeunlock --
+ * Release an exclusive lock.
+ */
+int
+__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: writeunlock %s", rwlock->name));
+
+ ReleaseSRWLockExclusive(&rwlock->rwlock);
+ return (0);
+}
+
+/*
+ * __wt_rwlock_destroy --
+ * Destroy a read/write lock.
+ */
+int
+__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
+{
+ WT_RWLOCK *rwlock;
+
+ rwlock = *rwlockp; /* Clear our caller's reference. */
+ if (rwlock == NULL)
+ return (0);
+ *rwlockp = NULL;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name));
+
+ /* Nothing to delete for Slim Reader Writer lock */
+
+ __wt_free(session, rwlock);
+ return (0);
+}
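
A usage sketch for the SRW-backed lock API above (the function is hypothetical): try the non-blocking acquire first, falling back to the blocking variant when the lock is contended.

	static int
	update_shared_state(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
	{
		/* Take the exclusive lock, blocking only if necessary. */
		if (__wt_try_writelock(session, rwlock) != 0)
			WT_RET(__wt_writelock(session, rwlock));

		/* ... modify the shared state ... */

		return (__wt_writeunlock(session, rwlock));
	}
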
diff --git a/src/third_party/wiredtiger/src/os_win/os_once.c b/src/third_party/wiredtiger/src/os_win/os_once.c
new file mode 100644
index 00000000000..40640acf129
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_once.c
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * _wt_init_once_callback --
+ * Global initialization, run once.
+ */
+BOOL CALLBACK _wt_init_once_callback(
+ _Inout_ PINIT_ONCE InitOnce,
+ _Inout_opt_ PVOID Parameter,
+ _Out_opt_ PVOID *Context
+ )
+{
+ void(*init_routine)(void) = Parameter;
+
+ init_routine();
+
+ return (TRUE);
+}
+
+/*
+ * __wt_once --
+ *	One-time initialization per process.
+ */
+int
+__wt_once(void(*init_routine)(void))
+{
+	static INIT_ONCE once_control = INIT_ONCE_STATIC_INIT;
+	PVOID lpContext = NULL;
+
+	return (!InitOnceExecuteOnce(&once_control,
+	    _wt_init_once_callback, init_routine, lpContext));
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_open.c b/src/third_party/wiredtiger/src/os_win/os_open.c
new file mode 100644
index 00000000000..7be98b604ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_open.c
@@ -0,0 +1,219 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_open --
+ * Open a file handle.
+ */
+int
+__wt_open(WT_SESSION_IMPL *session,
+ const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
+{
+ DWORD dwCreationDisposition;
+ HANDLE filehandle, filehandle_secondary;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_FH *fh, *tfh;
+ int direct_io, f, matched, share_mode;
+ char *path;
+
+ conn = S2C(session);
+ fh = NULL;
+ path = NULL;
+ filehandle = INVALID_HANDLE_VALUE;
+ filehandle_secondary = INVALID_HANDLE_VALUE;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));
+
+ /* Increment the reference count if we already have the file open. */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched)
+ return (0);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
+ /*
+ * Security:
+ * The application may spawn a new process, and we don't want another
+ * process to have access to our file handles.
+ *
+ * TODO: Set tighter file permissions but set bInheritHandle to false
+ * to prevent inheritance
+ */
+
+ f = FILE_ATTRIBUTE_NORMAL;
+
+ dwCreationDisposition = 0;
+ if (ok_create) {
+ dwCreationDisposition = CREATE_NEW;
+ if (exclusive)
+ dwCreationDisposition = CREATE_ALWAYS;
+ } else
+ dwCreationDisposition = OPEN_EXISTING;
+
+ direct_io = 0;
+
+ if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
+ f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+ direct_io = 1;
+ }
+
+ if (dio_type == WT_FILE_TYPE_LOG &&
+ FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
+ f |= FILE_FLAG_WRITE_THROUGH;
+ }
+
+ /* Disable read-ahead on trees: it slows down random read workloads. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ f |= FILE_FLAG_RANDOM_ACCESS;
+
+ filehandle = CreateFileA(path,
+ (GENERIC_READ | GENERIC_WRITE),
+ share_mode,
+ NULL,
+ dwCreationDisposition,
+ f,
+ NULL);
+ if (filehandle == INVALID_HANDLE_VALUE) {
+ if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
+ filehandle = CreateFileA(path,
+ (GENERIC_READ | GENERIC_WRITE),
+ share_mode,
+ NULL,
+ OPEN_EXISTING,
+ f,
+ NULL);
+
+ if (filehandle == INVALID_HANDLE_VALUE)
+ WT_ERR_MSG(session, __wt_errno(),
+ direct_io ?
+ "%s: open failed with direct I/O configured, some "
+ "filesystem types do not support direct I/O" :
+ "%s", path);
+ }
+
+ /*
+ * Open a second handle to file to support allocation/truncation
+ * concurrently with reads on the file. Writes would also move the file
+ * pointer.
+ */
+ filehandle_secondary = CreateFileA(path,
+ (GENERIC_READ | GENERIC_WRITE),
+ share_mode,
+ NULL,
+ OPEN_EXISTING,
+ f,
+ NULL);
+	if (filehandle_secondary == INVALID_HANDLE_VALUE)
+ WT_ERR_MSG(session, __wt_errno(),
+ "open failed for secondary handle: %s", path);
+
+ WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
+ WT_ERR(__wt_strdup(session, name, &fh->name));
+ fh->filehandle = filehandle;
+ fh->filehandle_secondary = filehandle_secondary;
+ fh->ref = 1;
+ fh->direct_io = direct_io;
+
+ /* Set the file's size. */
+ WT_ERR(__wt_filesize(session, fh, &fh->size));
+
+ /* Configure file extension. */
+ if (dio_type == WT_FILE_TYPE_DATA ||
+ dio_type == WT_FILE_TYPE_CHECKPOINT)
+ fh->extend_len = conn->data_extend_len;
+
+ /* Configure fallocate/posix_fallocate calls. */
+ __wt_fallocate_config(session, fh);
+
+ /*
+ * Repeat the check for a match, but then link onto the database's list
+ * of files.
+ */
+ matched = 0;
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(tfh, &conn->fhqh, q)
+ if (strcmp(name, tfh->name) == 0) {
+ ++tfh->ref;
+ *fhp = tfh;
+ matched = 1;
+ break;
+ }
+ if (!matched) {
+ TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_INCR(session, file_open);
+
+ *fhp = fh;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+ if (matched) {
+err: if (fh != NULL) {
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ }
+ if (filehandle != INVALID_HANDLE_VALUE)
+ (void)CloseHandle(filehandle);
+ if (filehandle_secondary != INVALID_HANDLE_VALUE)
+ (void)CloseHandle(filehandle_secondary);
+ }
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_close --
+ * Close a file handle.
+ */
+int
+__wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ __wt_spin_lock(session, &conn->fh_lock);
+ if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
+ __wt_spin_unlock(session, &conn->fh_lock);
+ return (0);
+ }
+
+ /* Remove from the list. */
+ TAILQ_REMOVE(&conn->fhqh, fh, q);
+ WT_STAT_FAST_CONN_DECR(session, file_open);
+
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ /* Discard the memory. */
+	if (CloseHandle(fh->filehandle) == 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "CloseHandle: %s", fh->name);
+ }
+
+	if (CloseHandle(fh->filehandle_secondary) == 0) {
+ ret = __wt_errno();
+ __wt_err(session, ret, "CloseHandle: secondary: %s", fh->name);
+ }
+
+ __wt_free(session, fh->name);
+ __wt_free(session, fh);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_path.c b/src/third_party/wiredtiger/src/os_win/os_path.c
new file mode 100644
index 00000000000..9f6b79c565c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_path.c
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_absolute_path --
+ * Return if a filename is an absolute path.
+ */
+int
+__wt_absolute_path(const char *path)
+{
+ /*
+ * Check for a drive name (for example, "D:"), allow both forward and
+ * backward slashes.
+ */
+ if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':')
+ path += 2;
+ return (path[0] == '/' || path[0] == '\\' ? 1 : 0);
+}
+
+/*
+ * __wt_path_separator --
+ * Return the path separator string.
+ */
+const char *
+__wt_path_separator(void)
+{
+ return ("\\");
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_priv.c b/src/third_party/wiredtiger/src/os_win/os_priv.c
new file mode 100644
index 00000000000..7b5152b4652
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_priv.c
@@ -0,0 +1,19 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_has_priv --
+ * Return if the process has special privileges, defined as having
+ *	different effective and real UIDs or GIDs.
+ */
+int
+__wt_has_priv(void)
+{
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_remove.c b/src/third_party/wiredtiger/src/os_win/os_remove.c
new file mode 100644
index 00000000000..d15ee929c00
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_remove.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __remove_file_check --
+ * Check if the file is currently open before removing it.
+ */
+static inline void
+__remove_file_check(WT_SESSION_IMPL *session, const char *name)
+{
+#ifdef HAVE_DIAGNOSTIC
+ WT_CONNECTION_IMPL *conn;
+ WT_FH *fh;
+
+ conn = S2C(session);
+ fh = NULL;
+
+ /*
+ * Check if the file is open: it's an error if it is, since a higher
+ * level should have closed it before removing.
+ */
+ __wt_spin_lock(session, &conn->fh_lock);
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
+ if (strcmp(name, fh->name) == 0)
+ break;
+ }
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ WT_ASSERT(session, fh == NULL);
+#else
+ WT_UNUSED(session);
+ WT_UNUSED(name);
+#endif
+}
+
+/*
+ * __wt_remove --
+ * Remove a file.
+ */
+int
+__wt_remove(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_DECL_RET;
+ char *path;
+ uint32_t lasterror;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: remove", name));
+
+ __remove_file_check(session, name);
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ if ((ret = DeleteFileA(path)) == FALSE)
+ lasterror = __wt_errno();
+
+ __wt_free(session, path);
+
+ if (ret != FALSE)
+ return (0);
+
+ WT_RET_MSG(session, lasterror, "%s: remove", name);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_rename.c b/src/third_party/wiredtiger/src/os_win/os_rename.c
new file mode 100644
index 00000000000..092f5d62a40
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_rename.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_rename --
+ * Rename a file.
+ */
+int
+__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+ WT_DECL_RET;
+ uint32_t lasterror;
+ char *from_path, *to_path;
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "rename %s to %s", from, to));
+
+ from_path = to_path = NULL;
+
+ WT_RET(__wt_filename(session, from, &from_path));
+ WT_TRET(__wt_filename(session, to, &to_path));
+
+ /*
+	 * Check whether the target exists: unlike POSIX rename, Windows
+	 * MoveFile does not overwrite an existing file.
+ */
+ if ((ret = GetFileAttributesA(to_path)) != INVALID_FILE_ATTRIBUTES) {
+ if ((ret = DeleteFileA(to_path)) == FALSE) {
+ lasterror = GetLastError();
+ goto err;
+ }
+ }
+
+	if ((ret = MoveFileA(from_path, to_path)) == FALSE)
+ lasterror = GetLastError();
+
+err:
+ __wt_free(session, from_path);
+ __wt_free(session, to_path);
+
+ if (ret != FALSE)
+ return (0);
+
+ WT_RET_MSG(session, lasterror, "MoveFile %s to %s", from, to);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_rw.c b/src/third_party/wiredtiger/src/os_win/os_rw.c
new file mode 100644
index 00000000000..291533bc6bc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_rw.c
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_read --
+ * Read a chunk.
+ */
+int
+__wt_read(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+{
+ DWORD chunk;
+ DWORD nr;
+ uint8_t *addr;
+ OVERLAPPED overlapped = { 0 };
+
+ nr = 0;
+
+ WT_STAT_FAST_CONN_INCR(session, read_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break reads larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
+ chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
+ overlapped.Offset = UINT32_MAX & offset;
+ overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
+
+ if (!ReadFile(fh->filehandle, addr, chunk, &nr, &overlapped))
+ WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(),
+ "%s read error: failed to read %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
+
+/*
+ * __wt_write --
+ * Write a chunk.
+ */
+int
+__wt_write(WT_SESSION_IMPL *session,
+ WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+{
+ DWORD chunk;
+ DWORD nw;
+ const uint8_t *addr;
+ OVERLAPPED overlapped = { 0 };
+
+ nw = 0;
+
+ WT_STAT_FAST_CONN_INCR(session, write_io);
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
+ fh->name, len, (uintmax_t)offset));
+
+ /* Assert direct I/O is aligned and a multiple of the alignment. */
+ WT_ASSERT(session,
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ (!((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
+ len >= S2C(session)->buffer_alignment &&
+ len % S2C(session)->buffer_alignment == 0));
+
+ /* Break writes larger than 1GB into 1GB chunks. */
+ for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
+ chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
+ overlapped.Offset = UINT32_MAX & offset;
+ overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
+
+ if (!WriteFile(fh->filehandle, addr, chunk, &nw, &overlapped))
+ WT_RET_MSG(session, __wt_errno(),
+ "%s write error: failed to write %" WT_SIZET_FMT
+ " bytes at offset %" PRIuMAX,
+ fh->name, chunk, (uintmax_t)offset);
+ }
+ return (0);
+}
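
A standalone restatement of the offset split used in both loops above (a hypothetical helper): Windows positional I/O passes the 64-bit file offset through the two 32-bit OVERLAPPED fields.

	static void
	overlapped_set_offset(OVERLAPPED *overlapped, wt_off_t offset)
	{
		/* Low and high halves of the 64-bit file offset. */
		overlapped->Offset = UINT32_MAX & offset;
		overlapped->OffsetHigh = UINT32_MAX & (offset >> 32);
	}
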
diff --git a/src/third_party/wiredtiger/src/os_win/os_sleep.c b/src/third_party/wiredtiger/src/os_win/os_sleep.c
new file mode 100644
index 00000000000..b9a8cc2e545
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_sleep.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_sleep --
+ * Pause the thread of control.
+ */
+void
+__wt_sleep(long seconds, long micro_seconds)
+{
+ Sleep(seconds * 1000 + micro_seconds / 1000);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_thread.c b/src/third_party/wiredtiger/src/os_win/os_thread.c
new file mode 100644
index 00000000000..4d8cf89f264
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_thread.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_thread_create --
+ * Create a new thread of control.
+ */
+int
+__wt_thread_create(WT_SESSION_IMPL *session,
+ wt_thread_t *tidret, void *(*func)(void *), void *arg)
+{
+ /* Spawn a new thread of control. */
+ *tidret = CreateThread(NULL, 0, func, arg, 0, NULL);
+ if (*tidret != NULL)
+ return (0);
+
+ WT_RET_MSG(session, __wt_errno(), "CreateThread");
+}
+
+/*
+ * __wt_thread_join --
+ * Wait for a thread of control to exit.
+ */
+int
+__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
+{
+ WT_DECL_RET;
+
+ if ((ret = WaitForSingleObject(tid, INFINITE)) == WAIT_OBJECT_0)
+ return (0);
+
+ WT_RET_MSG(session, ret, "WaitForSingleObject");
+}
+
+/*
+ * __wt_thread_id --
+ * Fill in a printable version of the process and thread IDs.
+ */
+void
+__wt_thread_id(char *buf, size_t buflen)
+{
+ (void)snprintf(buf, buflen,
+ "%" PRIu64 ":%" PRIu64,
+	    (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId());
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_time.c b/src/third_party/wiredtiger/src/os_win/os_time.c
new file mode 100644
index 00000000000..b49b738fe54
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_time.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_seconds --
+ * Return the seconds since the Epoch.
+ */
+int
+__wt_seconds(WT_SESSION_IMPL *session, time_t *timep)
+{
+ struct timespec t;
+
+ WT_RET(__wt_epoch(session, &t));
+
+ *timep = t.tv_sec;
+
+ return (0);
+}
+
+/*
+ * __wt_epoch --
+ * Return the time since the Epoch.
+ */
+int
+__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp)
+{
+ uint64_t ns100;
+
+ FILETIME time;
+ GetSystemTimeAsFileTime(&time);
+
+ ns100 = (((int64_t)time.dwHighDateTime << 32) + time.dwLowDateTime)
+ - 116444736000000000LL;
+ tsp->tv_sec = ns100 / 10000000;
+ tsp->tv_nsec = (long)((ns100 % 10000000) * 100);
+
+ return (0);
+}
+
+/*
+ * localtime_r --
+ * Return the current local time.
+ */
+struct tm *
+localtime_r(const time_t *timer, struct tm *result)
+{
+ errno_t err;
+
+ err = localtime_s(result, timer);
+ if (err != 0) {
+ __wt_err(NULL, err, "localtime_s");
+ return (NULL);
+ }
+
+ return (result);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c
new file mode 100644
index 00000000000..1058203e326
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_vsnprintf.c
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#undef vsnprintf
+
+_Check_return_opt_ int __cdecl _wt_vsnprintf(
+ _Out_writes_(_MaxCount) char * _DstBuf,
+ _In_ size_t _MaxCount,
+ _In_z_ _Printf_format_string_ const char * _Format,
+ va_list _ArgList)
+{
+ int len;
+
+	len = vsnprintf(_DstBuf, _MaxCount, _Format, _ArgList);
+
+ /*
+	 * The MSVC implementation returns -1 on truncation instead of the
+	 * number of bytes that would have been written. Rather than grow
+	 * the buffer iteratively, ask how large a buffer is required.
+ */
+ if (len == -1)
+ len = _vscprintf(_Format, _ArgList) + 1;
+
+ return (len);
+}
diff --git a/src/third_party/wiredtiger/src/os_win/os_yield.c b/src/third_party/wiredtiger/src/os_win/os_yield.c
new file mode 100644
index 00000000000..970bfa139d0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/os_win/os_yield.c
@@ -0,0 +1,18 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_yield --
+ * Yield the thread of control.
+ */
+void
+__wt_yield(void)
+{
+ SwitchToThread();
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_api.c b/src/third_party/wiredtiger/src/packing/pack_api.c
new file mode 100644
index 00000000000..c0c1e53c8ca
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_api.c
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * wiredtiger_struct_pack --
+ * Pack a byte string (extension API).
+ */
+int
+wiredtiger_struct_pack(WT_SESSION *wt_session,
+ void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_packv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
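+
+/*
+ * Example usage (an illustrative sketch, not part of the original
+ * change), assuming an open WT_SESSION *session and an int ret; error
+ * handling is omitted for brevity:
+ *
+ *	char buf[64];
+ *	size_t size;
+ *
+ *	ret = wiredtiger_struct_size(session, &size, "Si", "hello", 42);
+ *	if (ret == 0 && size <= sizeof(buf))
+ *		ret = wiredtiger_struct_pack(
+ *		    session, buf, sizeof(buf), "Si", "hello", 42);
+ */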
+
+/*
+ * wiredtiger_struct_size --
+ * Calculate the size of a packed byte string (extension API).
+ */
+int
+wiredtiger_struct_size(WT_SESSION *wt_session,
+ size_t *sizep, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_sizev(session, sizep, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * wiredtiger_struct_unpack --
+ * Unpack a byte string (extension API).
+ */
+int
+wiredtiger_struct_unpack(WT_SESSION *wt_session,
+ const void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_unpackv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_struct_pack --
+ * Pack a byte string (extension API).
+ */
+int
+__wt_ext_struct_pack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session :
+ ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_packv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_struct_size --
+ * Calculate the size of a packed byte string (extension API).
+ */
+int
+__wt_ext_struct_size(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ size_t *sizep, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session :
+ ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_sizev(session, sizep, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_struct_unpack --
+ * Unpack a byte string (extension API).
+ */
+int
+__wt_ext_struct_unpack(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session,
+ const void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ session = (wt_session != NULL) ? (WT_SESSION_IMPL *)wt_session :
+ ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_unpackv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_impl.c b/src/third_party/wiredtiger/src/packing/pack_impl.c
new file mode 100644
index 00000000000..12b1582e6d0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_impl.c
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_struct_check --
+ * Check that the specified packing format is valid, and whether it fits
+ * into a fixed-sized bitfield.
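+ * For example, the single-field format "8t" describes a fixed-length,
+ * 8-bit bitfield, while a format such as "SS" is not fixed-size.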
+ */
+int
+__wt_struct_check(WT_SESSION_IMPL *session,
+ const char *fmt, size_t len, int *fixedp, uint32_t *fixed_lenp)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ int fields;
+
+ WT_RET(__pack_initn(session, &pack, fmt, len));
+ for (fields = 0; (ret = __pack_next(&pack, &pv)) == 0; fields++)
+ ;
+
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ if (fixedp != NULL && fixed_lenp != NULL) {
+ if (fields == 0) {
+ *fixedp = 1;
+ *fixed_lenp = 0;
+ } else if (fields == 1 && pv.type == 't') {
+ *fixedp = 1;
+ *fixed_lenp = pv.size;
+ } else
+ *fixedp = 0;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_struct_size --
+ * Calculate the size of a packed byte string.
+ */
+int
+__wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_sizev(session, sizep, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_struct_pack --
+ * Pack a byte string.
+ */
+int
+__wt_struct_pack(WT_SESSION_IMPL *session,
+ void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_packv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_struct_unpack --
+ * Unpack a byte string.
+ */
+int
+__wt_struct_unpack(WT_SESSION_IMPL *session,
+ const void *buffer, size_t size, const char *fmt, ...)
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = __wt_struct_unpackv(session, buffer, size, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/packing/pack_stream.c b/src/third_party/wiredtiger/src/packing/pack_stream.c
new file mode 100644
index 00000000000..efbbd5d9adb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/packing/pack_stream.c
@@ -0,0 +1,296 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Streaming interface to packing.
+ *
+ * This allows applications to pack or unpack records one field at a time.
+ */
+struct __wt_pack_stream {
+ WT_PACK pack;
+ uint8_t *end, *p, *start;
+};
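+
+/*
+ * Example usage (an illustrative sketch, not part of the original
+ * change), assuming an open WT_SESSION *session and an int ret; error
+ * handling is omitted for brevity:
+ *
+ *	WT_PACK_STREAM *ps;
+ *	uint8_t buf[64];
+ *	size_t used;
+ *	int64_t i;
+ *	const char *s;
+ *
+ *	ret = wiredtiger_pack_start(session, "iS", buf, sizeof(buf), &ps);
+ *	ret = wiredtiger_pack_int(ps, 42);
+ *	ret = wiredtiger_pack_str(ps, "forty-two");
+ *	ret = wiredtiger_pack_close(ps, &used);
+ *
+ *	ret = wiredtiger_unpack_start(session, "iS", buf, used, &ps);
+ *	ret = wiredtiger_unpack_int(ps, &i);
+ *	ret = wiredtiger_unpack_str(ps, &s);
+ *	ret = wiredtiger_pack_close(ps, NULL);
+ */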
+
+/*
+ * wiredtiger_pack_start --
+ * Open a stream for packing.
+ */
+int
+wiredtiger_pack_start(WT_SESSION *wt_session,
+ const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp)
+{
+ WT_DECL_RET;
+ WT_PACK_STREAM *ps;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ WT_RET(__wt_calloc_def(session, 1, &ps));
+ WT_ERR(__pack_init(session, &ps->pack, format));
+ ps->p = ps->start = buffer;
+ ps->end = ps->p + len;
+ *psp = ps;
+
+ if (0) {
+err: (void)wiredtiger_pack_close(ps, NULL);
+ }
+ return (ret);
+}
+
+/*
+ * wiredtiger_unpack_start --
+ * Open a stream for unpacking.
+ */
+int
+wiredtiger_unpack_start(WT_SESSION *wt_session, const char *format,
+ const void *buffer, size_t size, WT_PACK_STREAM **psp)
+{
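+ /*
+ * Discarding the const qualifier is safe here: an unpack stream
+ * only reads from the buffer it is given.
+ */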
+ return (wiredtiger_pack_start(
+ wt_session, format, (void *)buffer, size, psp));
+}
+
+/*
+ * wiredtiger_pack_close --
+ * Close a packing stream.
+ */
+int
+wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp)
+{
+ /* Guard against a NULL stream before touching its fields. */
+ if (ps == NULL)
+ return (0);
+
+ if (usedp != NULL)
+ *usedp = WT_PTRDIFF(ps->p, ps->start);
+
+ __wt_free(ps->pack.session, ps);
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_item --
+ * Pack an item.
+ */
+int
+wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ pv.u.item.data = item->data;
+ pv.u.item.size = item->size;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_int --
+ * Pack a signed integer.
+ */
+int
+wiredtiger_pack_int(WT_PACK_STREAM *ps, int64_t i)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ pv.u.i = i;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_str --
+ * Pack a string.
+ */
+int
+wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ pv.u.s = s;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_pack_uint --
+ * Pack an unsigned int.
+ */
+int
+wiredtiger_pack_uint(WT_PACK_STREAM *ps, uint64_t u)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ pv.u.u = u;
+ WT_RET(__pack_write(
+ session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_item --
+ * Unpack an item.
+ */
+int
+wiredtiger_unpack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'U':
+ case 'u':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ item->data = pv.u.item.data;
+ item->size = pv.u.item.size;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_int --
+ * Unpack a signed integer.
+ */
+int
+wiredtiger_unpack_int(WT_PACK_STREAM *ps, int64_t *ip)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'b':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'q':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *ip = pv.u.i;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_str --
+ * Unpack a string.
+ */
+int
+wiredtiger_unpack_str(WT_PACK_STREAM *ps, const char **sp)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'S':
+ case 's':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *sp = pv.u.s;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
+
+/*
+ * wiredtiger_unpack_uint --
+ * Unpack an unsigned integer.
+ */
+int
+wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_SESSION_IMPL *session;
+
+ session = ps->pack.session;
+
+ WT_RET(__pack_next(&ps->pack, &pv));
+ switch (pv.type) {
+ case 'B':
+ case 'H':
+ case 'I':
+ case 'L':
+ case 'Q':
+ case 'R':
+ case 'r':
+ case 't':
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
+ *up = pv.u.u;
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c
new file mode 100644
index 00000000000..398fea4476f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_create.c
@@ -0,0 +1,595 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_direct_io_size_check --
+ * Return a size from the configuration, complaining if it's insufficient
+ * for direct I/O.
+ */
+int
+__wt_direct_io_size_check(WT_SESSION_IMPL *session,
+ const char **cfg, const char *config_name, uint32_t *allocsizep)
+{
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ int64_t align;
+
+ *allocsizep = 0;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_config_gets(session, cfg, config_name, &cval));
+
+ /*
+ * This function exists as a place to hang this comment: if direct I/O
+ * is configured, page sizes must be at least as large as any buffer
+ * alignment as well as a multiple of the alignment. Linux gets unhappy
+ * if you configure direct I/O and then don't do I/O in alignments and
+ * units of its happy place.
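+ *
+ * For example (illustrative): with a 4KB buffer_alignment, an
+ * allocation_size of 8KB is accepted, while 6KB is rejected because
+ * it is not a multiple of the alignment.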
+ */
+ if (FLD_ISSET(conn->direct_io,
+ WT_FILE_TYPE_CHECKPOINT | WT_FILE_TYPE_DATA)) {
+ align = (int64_t)conn->buffer_alignment;
+ if (align != 0 && (cval.val < align || cval.val % align != 0))
+ WT_RET_MSG(session, EINVAL,
+ "when direct I/O is configured, the %s size must "
+ "be at least as large as the buffer alignment as "
+ "well as a multiple of the buffer alignment",
+ config_name);
+ }
+ *allocsizep = (uint32_t)cval.val;
+ return (0);
+}
+
+/*
+ * __create_file --
+ * Create a new 'file:' object.
+ */
+static int
+__create_file(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, const char *config)
+{
+ WT_DECL_ITEM(val);
+ WT_DECL_RET;
+ uint32_t allocsize;
+ int is_metadata;
+ const char *fileconf, *filename;
+ const char **p, *filecfg[] =
+ { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
+
+ fileconf = NULL;
+
+ is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;
+
+ filename = uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);
+
+ /* Check if the file already exists. */
+ if (!is_metadata && (ret =
+ __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
+ if (exclusive)
+ WT_TRET(EEXIST);
+ goto err;
+ }
+
+ /* Sanity check the allocation size. */
+ WT_RET(__wt_direct_io_size_check(
+ session, filecfg, "allocation_size", &allocsize));
+
+ /* Create the file. */
+ WT_ERR(__wt_block_manager_create(session, filename, allocsize));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
+
+ /*
+ * If creating an ordinary file, append the file ID and current version
+ * numbers to the passed-in configuration and insert the resulting
+ * configuration into the metadata.
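+ * For example (values illustrative), the appended fragment is
+ * "id=5,version=(major=1,minor=1)".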
+ */
+ if (!is_metadata) {
+ WT_ERR(__wt_scr_alloc(session, 0, &val));
+ WT_ERR(__wt_buf_fmt(session, val,
+ "id=%" PRIu32 ",version=(major=%d,minor=%d)",
+ ++S2C(session)->next_file_id,
+ WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
+ for (p = filecfg; *p != NULL; ++p)
+ ;
+ *p = val->data;
+ WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
+ WT_ERR(__wt_metadata_insert(session, uri, fileconf));
+ }
+
+ /*
+ * Open the file to check that it was set up correctly. We don't need
+ * to pass the configuration: we just wrote the collapsed configuration
+ * into the metadata file, and it's going to be read/used by underlying
+ * functions.
+ *
+ * Keep the handle exclusive until it is released at the end of the
+ * call, otherwise we could race with a drop.
+ */
+ WT_ERR(__wt_session_get_btree(
+ session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_handle_lock(session, 1));
+ else
+ WT_ERR(__wt_session_release_btree(session));
+
+err: __wt_scr_free(&val);
+ __wt_free(session, fileconf);
+ return (ret);
+}
+
+/*
+ * __wt_schema_colgroup_source --
+ * Get the URI of the data source for a column group.
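+ * For example (illustrative), the default source for "table:main" is
+ * "file:main.wt"; for its column group "c1" it is "file:main_c1.wt".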
+ */
+int
+__wt_schema_colgroup_source(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ size_t len;
+ const char *prefix, *suffix, *tablename;
+
+ tablename = table->name + strlen("table:");
+ if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 &&
+ !WT_STRING_MATCH("file", cval.str, cval.len)) {
+ prefix = cval.str;
+ len = cval.len;
+ suffix = "";
+ } else {
+ prefix = "file";
+ len = strlen(prefix);
+ suffix = ".wt";
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ if (cgname == NULL)
+ WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s%s",
+ (int)len, prefix, tablename, suffix));
+ else
+ WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s_%s%s",
+ (int)len, prefix, tablename, cgname, suffix));
+
+ return (0);
+}
+
+/*
+ * __create_colgroup --
+ * Create a column group.
+ */
+static int
+__create_colgroup(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_ITEM confbuf, fmt, namebuf;
+ WT_TABLE *table;
+ size_t tlen;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
+ const char *sourcecfg[] = { config, NULL, NULL };
+ const char **cfgp;
+ const char *cgconf, *cgname, *sourceconf, *oldconf;
+ const char *source, *tablename;
+
+ cgconf = sourceconf = oldconf = NULL;
+ WT_CLEAR(fmt);
+ WT_CLEAR(confbuf);
+ WT_CLEAR(namebuf);
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
+ return (EINVAL);
+ cgname = strchr(tablename, ':');
+ if (cgname != NULL) {
+ tlen = (size_t)(cgname - tablename);
+ ++cgname;
+ } else
+ tlen = strlen(tablename);
+
+ if ((ret =
+ __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
+ WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret,
+ "Can't create '%s' for non-existent table '%.*s'",
+ name, (int)tlen, tablename);
+
+ /* Make sure the column group is referenced from the table. */
+ if (cgname != NULL && (ret =
+ __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "Column group '%s' not found in table '%.*s'",
+ cgname, (int)tlen, tablename);
+
+ /* Find the first NULL entry in the cfg stack. */
+ for (cfgp = &cfg[1]; *cfgp; cfgp++)
+ ;
+
+ /* Add the source to the colgroup config before collapsing. */
+ if (__wt_config_getones(
+ session, config, "source", &cval) == 0 && cval.len != 0) {
+ WT_ERR(__wt_buf_fmt(
+ session, &namebuf, "%.*s", (int)cval.len, cval.str));
+ source = namebuf.data;
+ } else {
+ WT_ERR(__wt_schema_colgroup_source(
+ session, table, cgname, config, &namebuf));
+ source = namebuf.data;
+ WT_ERR(__wt_buf_fmt(
+ session, &confbuf, "source=\"%s\"", source));
+ *cfgp++ = confbuf.data;
+ }
+
+ /* Calculate the key/value formats: these go into the source config. */
+ WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format));
+ if (cgname == NULL)
+ WT_ERR(__wt_buf_catfmt
+ (session, &fmt, ",value_format=%s", table->value_format));
+ else {
+ if (__wt_config_getones(session, config, "columns", &cval) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "No 'columns' configuration for '%s'", name);
+ WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format="));
+ WT_ERR(__wt_struct_reformat(session,
+ table, cval.str, cval.len, NULL, 1, &fmt));
+ }
+ sourcecfg[1] = fmt.data;
+ WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf));
+
+ WT_ERR(__wt_schema_create(session, source, sourceconf));
+
+ WT_ERR(__wt_config_collapse(session, cfg, &cgconf));
+ if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+ WT_ERR(__wt_schema_open_colgroups(session, table));
+
+err: __wt_free(session, cgconf);
+ __wt_free(session, sourceconf);
+ __wt_free(session, oldconf);
+ __wt_buf_free(session, &confbuf);
+ __wt_buf_free(session, &fmt);
+ __wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_index_source --
+ * Get the URI of the data source for an index.
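+ * For example (illustrative), the default source for index "i1" on
+ * "table:main" is "file:main_i1.wti".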
+ */
+int
+__wt_schema_index_source(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ size_t len;
+ const char *prefix, *suffix, *tablename;
+
+ tablename = table->name + strlen("table:");
+ if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 &&
+ !WT_STRING_MATCH("file", cval.str, cval.len)) {
+ prefix = cval.str;
+ len = cval.len;
+ suffix = "_idx";
+ } else {
+ prefix = "file";
+ len = strlen(prefix);
+ suffix = ".wti";
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s_%s%s",
+ (int)len, prefix, tablename, idxname, suffix));
+
+ return (0);
+}
+
+/*
+ * __create_index --
+ * Create an index.
+ */
+static int
+__create_index(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG pkcols;
+ WT_CONFIG_ITEM ckey, cval, icols;
+ WT_DECL_RET;
+ WT_ITEM confbuf, extra_cols, fmt, namebuf;
+ WT_TABLE *table;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
+ const char *sourcecfg[] = { config, NULL, NULL };
+ const char *sourceconf, *source, *idxconf, *idxname;
+ const char *tablename;
+ size_t tlen;
+ u_int i;
+
+ idxconf = sourceconf = NULL;
+ WT_CLEAR(confbuf);
+ WT_CLEAR(fmt);
+ WT_CLEAR(extra_cols);
+ WT_CLEAR(namebuf);
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "index:"))
+ return (EINVAL);
+ idxname = strchr(tablename, ':');
+ if (idxname == NULL)
+ WT_RET_MSG(session, EINVAL, "Invalid index name, "
+ "should be <table name>:<index name>: %s", name);
+
+ tlen = (size_t)(idxname++ - tablename);
+ if ((ret =
+ __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
+ WT_RET_MSG(session, ret,
+ "Can't create an index for a non-existent table: %.*s",
+ (int)tlen, tablename);
+
+ if (__wt_config_getones(session, config, "source", &cval) == 0) {
+ WT_ERR(__wt_buf_fmt(session, &namebuf,
+ "%.*s", (int)cval.len, cval.str));
+ source = namebuf.data;
+ } else {
+ WT_ERR(__wt_schema_index_source(
+ session, table, idxname, config, &namebuf));
+ source = namebuf.data;
+
+ /* Add the source name to the index config before collapsing. */
+ WT_ERR(__wt_buf_catfmt(session, &confbuf,
+ ",source=\"%s\"", source));
+ }
+
+ /* Calculate the key/value formats. */
+ if (__wt_config_getones(session, config, "columns", &icols) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "No 'columns' configuration for '%s'", name);
+
+ /*
+ * The key format for an index is somewhat subtle: the application
+ * specifies a set of columns that it will use for the key, but the
+ * engine usually adds some hidden columns in order to derive the
+ * primary key. These hidden columns are part of the source's
+ * key_format, which we are calculating now, but not part of an index
+ * cursor's key_format.
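+ *
+ * For example (illustrative): given a table with key_format=S and
+ * columns=(id,name,addr), an index on (name) gets a source
+ * key_format of "SS" (name plus the hidden primary key column id),
+ * while the index cursor's key_format is just "S".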
+ */
+ WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf));
+ for (i = 0; i < table->nkey_columns &&
+ (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0;
+ i++) {
+ /*
+ * If the primary key column is already in the secondary key,
+ * don't add it again.
+ */
+ if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0)
+ continue;
+ WT_ERR(__wt_buf_catfmt(
+ session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * Index values are normally empty: all columns are packed into the
+ * index key. The exception is LSM, which (currently) reserves empty
+ * values as tombstones. Use a single padding byte in that case.
+ */
+ if (WT_PREFIX_MATCH(source, "lsm:"))
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=x,"));
+ else
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,"));
+ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format="));
+ WT_ERR(__wt_struct_reformat(session, table,
+ icols.str, icols.len, (const char *)extra_cols.data, 0, &fmt));
+
+ /* Check for a record number index key, which makes no sense. */
+ WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval));
+ if (cval.len == 1 && cval.str[0] == 'r')
+ WT_ERR_MSG(session, EINVAL,
+ "column-store index may not use the record number as its "
+ "index key");
+
+ sourcecfg[1] = fmt.data;
+ WT_ERR(__wt_config_concat(session, sourcecfg, &sourceconf));
+
+ WT_ERR(__wt_schema_create(session, source, sourceconf));
+
+ cfg[1] = sourceconf;
+ cfg[2] = confbuf.data;
+ WT_ERR(__wt_config_collapse(session, cfg, &idxconf));
+ if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+err: __wt_free(session, idxconf);
+ __wt_free(session, sourceconf);
+ __wt_buf_free(session, &confbuf);
+ __wt_buf_free(session, &extra_cols);
+ __wt_buf_free(session, &fmt);
+ __wt_buf_free(session, &namebuf);
+
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __create_table --
+ * Create a table.
+ */
+static int
+__create_table(WT_SESSION_IMPL *session,
+ const char *name, int exclusive, const char *config)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM cgkey, cgval, cval;
+ WT_DECL_RET;
+ WT_TABLE *table;
+ size_t cgsize;
+ int ncolgroups;
+ char *cgname;
+ const char *cfg[4] =
+ { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL };
+ const char *tableconf, *tablename;
+
+ cgname = NULL;
+ table = NULL;
+ tableconf = NULL;
+
+ tablename = name;
+ if (!WT_PREFIX_SKIP(tablename, "table:"))
+ return (EINVAL);
+
+ if ((ret = __wt_schema_get_table(session,
+ tablename, strlen(tablename), 0, &table)) == 0) {
+ __wt_schema_release_table(session, table);
+ return (exclusive ? EEXIST : 0);
+ }
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_gets(session, cfg, "colgroups", &cval));
+ WT_RET(__wt_config_subinit(session, &conf, &cval));
+ for (ncolgroups = 0;
+ (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0;
+ ncolgroups++)
+ ;
+ WT_RET_NOTFOUND_OK(ret);
+
+ WT_RET(__wt_config_collapse(session, cfg, &tableconf));
+ if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) {
+ /*
+ * If the entry already exists in the metadata, we're done.
+ * This is an error for exclusive creates but okay otherwise.
+ */
+ if (ret == WT_DUPLICATE_KEY)
+ ret = exclusive ? EEXIST : 0;
+ goto err;
+ }
+
+ /* Attempt to open the table now to catch any errors. */
+ WT_ERR(__wt_schema_get_table(
+ session, tablename, strlen(tablename), 1, &table));
+
+ if (ncolgroups == 0) {
+ cgsize = strlen("colgroup:") + strlen(tablename) + 1;
+ WT_ERR(__wt_calloc_def(session, cgsize, &cgname));
+ snprintf(cgname, cgsize, "colgroup:%s", tablename);
+ WT_ERR(__create_colgroup(session, cgname, exclusive, config));
+ }
+
+ if (0) {
+err: if (table != NULL) {
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+ }
+ }
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
+ __wt_free(session, cgname);
+ __wt_free(session, tableconf);
+ return (ret);
+}
+
+/*
+ * __create_data_source --
+ * Create a custom data source.
+ */
+static int
+__create_data_source(WT_SESSION_IMPL *session,
+ const char *uri, const char *config, WT_DATA_SOURCE *dsrc)
+{
+ WT_CONFIG_ITEM cval;
+ const char *cfg[] = {
+ WT_CONFIG_BASE(session, session_create), config, NULL };
+
+ /*
+ * Check to be sure the key/value formats are legal: the underlying
+ * data source doesn't have access to the functions that check.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+ WT_RET(__wt_config_gets(session, cfg, "value_format", &cval));
+ WT_RET(__wt_struct_check(session, cval.str, cval.len, NULL, NULL));
+
+ /*
+ * User-specified collators aren't supported for data-source objects.
+ */
+ if (__wt_config_getones(
+ session, config, "collator", &cval) != WT_NOTFOUND)
+ WT_RET_MSG(session, EINVAL,
+ "WT_DATA_SOURCE objects do not support WT_COLLATOR "
+ "ordering");
+
+ return (dsrc->create(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg));
+}
+
+/*
+ * __wt_schema_create --
+ * Process a WT_SESSION::create operation for all supported types.
+ */
+int
+__wt_schema_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ int exclusive;
+
+ exclusive = (
+ __wt_config_getones(session, config, "exclusive", &cval) == 0 &&
+ cval.val != 0);
+
+ /*
+ * We track create operations: if we fail in the middle of creating a
+ * complex object, we want to back it all out.
+ */
+ WT_RET(__wt_meta_track_on(session));
+
+ if (WT_PREFIX_MATCH(uri, "colgroup:"))
+ ret = __create_colgroup(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __create_file(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_create(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "index:"))
+ ret = __create_index(session, uri, exclusive, config);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __create_table(session, uri, exclusive, config);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->create == NULL ?
+ __wt_object_unsupported(session, uri) :
+ __create_data_source(session, uri, config, dsrc);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ session->dhandle = NULL;
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c
new file mode 100644
index 00000000000..6df7e6930c9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_drop.c
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __drop_file --
+ * Drop a file.
+ */
+static int
+__drop_file(
+ WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int exist, remove_files;
+ const char *filename;
+
+ WT_RET(__wt_config_gets(session, cfg, "remove_files", &cval));
+ remove_files = (cval.val != 0);
+
+ filename = uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ return (EINVAL);
+
+ /* Close all btree handles associated with this file. */
+ WT_RET(__wt_conn_dhandle_close_all(session, uri, force));
+
+ /* Remove the metadata entry (ignore missing items). */
+ WT_TRET(__wt_metadata_remove(session, uri));
+ if (force && ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (!remove_files)
+ return (ret);
+
+ /* Remove the underlying physical file. */
+ exist = 0;
+ WT_TRET(__wt_exist(session, filename, &exist));
+ if (exist) {
+ /*
+ * There is no point tracking this operation: there is no going
+ * back from here.
+ */
+ WT_TRET(__wt_remove(session, filename));
+ }
+
+ return (ret);
+}
+
+/*
+ * __drop_colgroup --
+ * WT_SESSION::drop for a colgroup.
+ */
+static int
+__drop_colgroup(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ /* If we can get the colgroup, detach it from the table. */
+ if ((ret = __wt_schema_get_colgroup(
+ session, uri, &table, &colgroup)) == 0) {
+ table->cg_complete = 0;
+ WT_TRET(__wt_schema_drop(session, colgroup->source, cfg));
+ }
+
+ WT_TRET(__wt_metadata_remove(session, uri));
+ return (ret);
+}
+
+/*
+ * __drop_index --
+ * WT_SESSION::drop for an index.
+ */
+static int
+__drop_index(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_INDEX *idx;
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ /* If we can get the index, detach it from the table. */
+ if ((ret = __wt_schema_get_index(session, uri, &table, &idx)) == 0) {
+ table->idx_complete = 0;
+ WT_TRET(__wt_schema_drop(session, idx->source, cfg));
+ }
+
+ WT_TRET(__wt_metadata_remove(session, uri));
+ return (ret);
+}
+
+/*
+ * __drop_table --
+ * WT_SESSION::drop for a table.
+ */
+static int
+__drop_table(
+ WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *name;
+ u_int i;
+
+ name = uri;
+ (void)WT_PREFIX_SKIP(name, "table:");
+
+ table = NULL;
+ WT_ERR(__wt_schema_get_table(session, name, strlen(name), 1, &table));
+
+ /* Drop the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if ((colgroup = table->cgroups[i]) == NULL)
+ continue;
+ WT_ERR(__wt_metadata_remove(session, colgroup->name));
+ WT_ERR(__wt_schema_drop(session, colgroup->source, cfg));
+ }
+
+ /* Drop the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ if ((idx = table->indices[i]) == NULL)
+ continue;
+ WT_ERR(__wt_metadata_remove(session, idx->name));
+ WT_ERR(__wt_schema_drop(session, idx->source, cfg));
+ }
+
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+
+ /* Remove the metadata entry (ignore missing items). */
+ WT_ERR(__wt_metadata_remove(session, uri));
+
+err: if (force && ret == WT_NOTFOUND)
+ ret = 0;
+ if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_drop --
+ * Process a WT_SESSION::drop operation for all supported types.
+ */
+int
+__wt_schema_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ int force;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = (cval.val != 0);
+
+ WT_RET(__wt_meta_track_on(session));
+
+ /* Be careful to ignore any btree handle in our caller. */
+ WT_CLEAR_BTREE_IN_SESSION(session);
+
+ if (WT_PREFIX_MATCH(uri, "colgroup:"))
+ ret = __drop_colgroup(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __drop_file(session, uri, force, cfg);
+ else if (WT_PREFIX_MATCH(uri, "index:"))
+ ret = __drop_index(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_drop(session, uri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __drop_table(session, uri, force, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->drop == NULL ?
+ __wt_object_unsupported(session, uri) :
+ dsrc->drop(
+ dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ /*
+ * Map WT_NOTFOUND to ENOENT (or to 0 if "force" is set), based on the
+ * assumption WT_NOTFOUND means there was no metadata entry. The
+ * underlying drop functions should handle this case (we passed them
+ * the "force" value), but better safe than sorry.
+ */
+ if (ret == WT_NOTFOUND)
+ ret = force ? 0 : ENOENT;
+
+ /* Bump the schema generation so that stale data is ignored. */
+ ++S2C(session)->schema_gen;
+
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_list.c b/src/third_party/wiredtiger/src/schema/schema_list.c
new file mode 100644
index 00000000000..05421283bf6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_list.c
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __schema_add_table --
+ * Add a table handle to the session's cache.
+ */
+static int
+__schema_add_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_TABLE *table;
+
+ WT_RET(__wt_schema_open_table(session, name, namelen, &table));
+
+ /* Copy the schema generation into the new table. */
+ table->schema_gen = S2C(session)->schema_gen;
+
+ TAILQ_INSERT_HEAD(&session->tables, table, q);
+ *tablep = table;
+
+ return (0);
+}
+
+/*
+ * __schema_find_table --
+ * Find the table handle for the named table in the session cache.
+ */
+static int
+__schema_find_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_TABLE *table;
+ const char *tablename;
+
+restart:
+ TAILQ_FOREACH(table, &session->tables, q) {
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+ if (WT_STRING_MATCH(tablename, name, namelen)) {
+ /*
+ * Ignore stale tables.
+ *
+ * XXX: should be managed the same as btree handles,
+ * with a local cache in each session and a shared list
+ * in the connection. There is still a race here
+ * between checking the generation and opening the
+ * first column group.
+ */
+ if (table->schema_gen != S2C(session)->schema_gen) {
+ if (table->refcnt == 0) {
+ __wt_schema_remove_table(
+ session, table);
+ goto restart;
+ }
+ continue;
+ }
+ *tablep = table;
+ return (0);
+ }
+ }
+
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __wt_schema_get_table --
+ * Get the table handle for the named table.
+ */
+int
+__wt_schema_get_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, int ok_incomplete, WT_TABLE **tablep)
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+
+ *tablep = table = NULL;
+ ret = __schema_find_table(session, name, namelen, &table);
+
+ if (ret == WT_NOTFOUND)
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __schema_add_table(session, name, namelen, &table));
+
+ if (ret == 0) {
+ if (!ok_incomplete && !table->cg_complete)
+ WT_RET_MSG(session, EINVAL, "'%s' cannot be used "
+ "until all column groups are created",
+ table->name);
+
+ ++table->refcnt;
+ *tablep = table;
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_schema_release_table --
+ * Release a table handle.
+ */
+void
+__wt_schema_release_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_ASSERT(session, table->refcnt > 0);
+ --table->refcnt;
+}
+
+/*
+ * __wt_schema_destroy_colgroup --
+ * Free a column group handle.
+ */
+void
+__wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP *colgroup)
+{
+ __wt_free(session, colgroup->name);
+ __wt_free(session, colgroup->source);
+ __wt_free(session, colgroup->config);
+ __wt_free(session, colgroup);
+}
+
+/*
+ * __wt_schema_destroy_index --
+ * Free an index handle.
+ */
+void
+__wt_schema_destroy_index(WT_SESSION_IMPL *session, WT_INDEX *idx)
+{
+ __wt_free(session, idx->name);
+ __wt_free(session, idx->source);
+ __wt_free(session, idx->config);
+ __wt_free(session, idx->key_format);
+ __wt_free(session, idx->key_plan);
+ __wt_free(session, idx->value_plan);
+ __wt_free(session, idx->idxkey_format);
+ __wt_free(session, idx);
+}
+
+/*
+ * __wt_schema_destroy_table --
+ * Free a table handle.
+ */
+void
+__wt_schema_destroy_table(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_COLGROUP *colgroup;
+ WT_INDEX *idx;
+ u_int i;
+
+ __wt_free(session, table->name);
+ __wt_free(session, table->config);
+ __wt_free(session, table->plan);
+ __wt_free(session, table->key_format);
+ __wt_free(session, table->value_format);
+ if (table->cgroups != NULL) {
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if ((colgroup = table->cgroups[i]) == NULL)
+ continue;
+ __wt_schema_destroy_colgroup(session, colgroup);
+ }
+ __wt_free(session, table->cgroups);
+ }
+ if (table->indices != NULL) {
+ for (i = 0; i < table->nindices; i++) {
+ if ((idx = table->indices[i]) == NULL)
+ continue;
+ __wt_schema_destroy_index(session, idx);
+ }
+ __wt_free(session, table->indices);
+ }
+ __wt_free(session, table);
+}
+
+/*
+ * __wt_schema_remove_table --
+ * Remove the table handle from the session, closing if necessary.
+ */
+void
+__wt_schema_remove_table(
+ WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_ASSERT(session, table->refcnt <= 1);
+
+ TAILQ_REMOVE(&session->tables, table, q);
+ __wt_schema_destroy_table(session, table);
+}
+
+/*
+ * __wt_schema_close_tables --
+ * Close all of the tables in a session.
+ */
+void
+__wt_schema_close_tables(WT_SESSION_IMPL *session)
+{
+ WT_TABLE *table;
+
+ while ((table = TAILQ_FIRST(&session->tables)) != NULL)
+ __wt_schema_remove_table(session, table);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_open.c b/src/third_party/wiredtiger/src/schema/schema_open.c
new file mode 100644
index 00000000000..0332569a8e3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_open.c
@@ -0,0 +1,510 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_colgroup_name --
+ * Get the URI for a column group. This is used for metadata lookups.
+ * The only complexity here is that simple tables (with a single column
+ * group) use a simpler naming scheme.
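+ * For example (illustrative): "colgroup:main" for a simple table,
+ * versus "colgroup:main:c1" for a named column group.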
+ */
+int
+__wt_schema_colgroup_name(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *cgname, size_t len, WT_ITEM *buf)
+{
+ const char *tablename;
+
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+
+ return ((table->ncolgroups == 0) ?
+ __wt_buf_fmt(session, buf, "colgroup:%s", tablename) :
+ __wt_buf_fmt(session, buf, "colgroup:%s:%.*s",
+ tablename, (int)len, cgname));
+}
+
+/*
+ * __wt_schema_open_colgroups --
+ * Open the column groups for a table.
+ */
+int
+__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_COLGROUP *colgroup;
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_DECL_RET;
+ WT_DECL_ITEM(buf);
+ const char *cgconfig;
+ u_int i;
+
+ if (table->cg_complete)
+ return (0);
+
+ colgroup = NULL;
+ cgconfig = NULL;
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));
+
+ /* Open each column group. */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ if (table->ncolgroups > 0)
+ WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
+ else
+ WT_CLEAR(ckey);
+
+ /*
+ * Always open from scratch: we may have failed part of the way
+ * through opening a table, or column groups may have changed.
+ */
+ if (table->cgroups[i] != NULL) {
+ __wt_schema_destroy_colgroup(
+ session, table->cgroups[i]);
+ table->cgroups[i] = NULL;
+ }
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_schema_colgroup_name(session, table,
+ ckey.str, ckey.len, buf));
+ if ((ret = __wt_metadata_search(
+ session, buf->data, &cgconfig)) != 0) {
+ /* It is okay if the table is incomplete. */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ WT_ERR(__wt_calloc_def(session, 1, &colgroup));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &colgroup->name));
+ colgroup->config = cgconfig;
+ cgconfig = NULL;
+ WT_ERR(__wt_config_getones(session,
+ colgroup->config, "columns", &colgroup->colconf));
+ WT_ERR(__wt_config_getones(
+ session, colgroup->config, "source", &cval));
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &colgroup->source));
+ table->cgroups[i] = colgroup;
+ colgroup = NULL;
+ }
+
+ if (!table->is_simple) {
+ WT_ERR(__wt_table_check(session, table));
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_struct_plan(session,
+ table, table->colconf.str, table->colconf.len, 1, buf));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &table->plan));
+ }
+
+ table->cg_complete = 1;
+
+err: __wt_scr_free(&buf);
+ if (colgroup != NULL)
+ __wt_schema_destroy_colgroup(session, colgroup);
+ if (cgconfig != NULL)
+ __wt_free(session, cgconfig);
+ return (ret);
+}
+
+/*
+ * __open_index --
+ * Open an index.
+ */
+static int
+__open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
+{
+ WT_CONFIG colconf;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_DECL_ITEM(buf);
+ WT_DECL_ITEM(plan);
+ WT_DECL_RET;
+ u_int cursor_key_cols, i;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /* Get the data source from the index config. */
+ WT_ERR(__wt_config_getones(session, idx->config, "source", &cval));
+ WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->source));
+
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval));
+ WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->key_format));
+
+ /*
+ * The key format for an index is somewhat subtle: the application
+ * specifies a set of columns that it will use for the key, but the
+ * engine usually adds some hidden columns in order to derive the
+ * primary key. These hidden columns are part of the file's key.
+ *
+ * The file's key_format is stored persistently, we need to calculate
+ * the index cursor key format (which will usually omit some of those
+ * keys).
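+ *
+ * For example (illustrative), a persistent key_format of "SS" with
+ * one declared index column is truncated below to an idxkey_format
+ * of "S".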
+ */
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_config_getones(
+ session, idx->config, "columns", &idx->colconf));
+
+ /* Start with the declared index columns. */
+ WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf));
+ cursor_key_cols = 0;
+ while ((ret = __wt_config_next(&colconf, &ckey, &cval)) == 0) {
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, "%.*s,", (int)ckey.len, ckey.str));
+ ++cursor_key_cols;
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ /*
+ * Now add any primary key columns from the table that are not
+ * already part of the index key.
+ */
+ WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf));
+ for (i = 0; i < table->nkey_columns &&
+ (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0;
+ i++) {
+ /*
+ * If the primary key column is already in the secondary key,
+ * don't add it again.
+ */
+ if (__wt_config_subgetraw(
+ session, &idx->colconf, &ckey, &cval) == 0)
+ continue;
+ WT_ERR(__wt_buf_catfmt(
+ session, buf, "%.*s,", (int)ckey.len, ckey.str));
+ }
+ if (ret != 0 && ret != WT_NOTFOUND)
+ goto err;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &plan));
+ WT_ERR(__wt_struct_plan(session, table, buf->data, buf->size, 0, plan));
+ WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->key_plan));
+
+ /* Set up the cursor key format (the visible columns). */
+ WT_ERR(__wt_buf_init(session, buf, 0));
+ WT_ERR(__wt_struct_truncate(session,
+ idx->key_format, cursor_key_cols, buf));
+ WT_ERR(__wt_strndup(
+ session, buf->data, buf->size, &idx->idxkey_format));
+
+ /* By default, index cursor values are the table value columns. */
+ /* TODO Optimize to use index columns in preference to table lookups. */
+ WT_ERR(__wt_buf_init(session, plan, 0));
+ WT_ERR(__wt_struct_plan(session,
+ table, table->colconf.str, table->colconf.len, 1, plan));
+ WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->value_plan));
+
+err: __wt_scr_free(&buf);
+ __wt_scr_free(&plan);
+ return (ret);
+}
+
+/*
+ * __wt_schema_open_index --
+ * Open one or more indices for a table.
+ */
+int
+__wt_schema_open_index(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ u_int i;
+ int cmp, match;
+ const char *idxconf, *name, *tablename, *uri;
+
+ /* Check if we've already done the work. */
+ if (idxname == NULL && table->idx_complete)
+ return (0);
+
+ cursor = NULL;
+ idx = NULL;
+
+ /* Build a search key. */
+ tablename = table->name;
+ (void)WT_PREFIX_SKIP(tablename, "table:");
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));
+
+ /* Find matching indices. */
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, tmp->data);
+ if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
+ ret = cursor->next(cursor);
+ for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ name = uri;
+ if (!WT_PREFIX_SKIP(name, tmp->data))
+ break;
+
+ /* Is this the index we are looking for? */
+ match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);
+
+ /*
+ * Ensure there is space, including if we have to make room for
+ * a new entry in the middle of the list.
+ */
+ WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
+ WT_MAX(i, table->nindices) + 1, &table->indices));
+
+ /* Keep the in-memory list in sync with the metadata. */
+ cmp = 0;
+ while (table->indices[i] != NULL &&
+ (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
+ /* Index no longer exists, remove it. */
+ __wt_schema_destroy_index(session, table->indices[i]);
+ memmove(&table->indices[i], &table->indices[i + 1],
+ (table->nindices - i) * sizeof(WT_INDEX *));
+ table->indices[--table->nindices] = NULL;
+ }
+ if (cmp < 0) {
+ /* Make room for a new index. */
+ memmove(&table->indices[i + 1], &table->indices[i],
+ (table->nindices - i) * sizeof(WT_INDEX *));
+ table->indices[i] = NULL;
+ ++table->nindices;
+ }
+
+ if (!match)
+ continue;
+
+ if (table->indices[i] == NULL) {
+ WT_ERR(cursor->get_value(cursor, &idxconf));
+ WT_ERR(__wt_calloc_def(session, 1, &idx));
+ WT_ERR(__wt_strdup(session, uri, &idx->name));
+ WT_ERR(__wt_strdup(session, idxconf, &idx->config));
+ WT_ERR(__open_index(session, table, idx));
+
+ table->indices[i] = idx;
+ idx = NULL;
+ }
+
+ /* If we were looking for a single index, we're done. */
+ if (indexp != NULL)
+ *indexp = table->indices[i];
+ if (idxname != NULL)
+ break;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* If we did a full pass, we won't need to do it again. */
+ if (idxname == NULL) {
+ table->nindices = i;
+ table->idx_complete = 1;
+ }
+
+err: __wt_scr_free(&tmp);
+ if (idx != NULL)
+ __wt_schema_destroy_index(session, idx);
+ if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_schema_open_indices --
+ * Open the indices for a table.
+ */
+int
+__wt_schema_open_indices(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ return (__wt_schema_open_index(session, table, NULL, 0, NULL));
+}
+
+/*
+ * __wt_schema_open_table --
+ * Open a named table.
+ */
+int
+__wt_schema_open_table(WT_SESSION_IMPL *session,
+ const char *name, size_t namelen, WT_TABLE **tablep)
+{
+ WT_CONFIG cparser;
+ WT_CONFIG_ITEM ckey, cval;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_TABLE *table;
+ const char *tconfig;
+ char *tablename;
+
+ cursor = NULL;
+ table = NULL;
+ tablename = NULL;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name));
+ WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename));
+
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ cursor->set_key(cursor, tablename);
+ WT_ERR(cursor->search(cursor));
+ WT_ERR(cursor->get_value(cursor, &tconfig));
+
+ WT_ERR(__wt_calloc_def(session, 1, &table));
+ table->name = tablename;
+ tablename = NULL;
+
+ WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval));
+
+ WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format));
+ WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format));
+ WT_ERR(__wt_strdup(session, tconfig, &table->config));
+
+ /* Point to some items in the copy to save re-parsing. */
+ WT_ERR(__wt_config_getones(session, table->config,
+ "columns", &table->colconf));
+
+ /*
+ * Count the number of columns: tables are "simple" if the columns
+ * are not named.
+ */
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->colconf));
+ table->is_simple = 1;
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ table->is_simple = 0;
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ /* Check that the columns match the key and value formats. */
+ if (!table->is_simple)
+ WT_ERR(__wt_schema_colcheck(session,
+ table->key_format, table->value_format, &table->colconf,
+ &table->nkey_columns, NULL));
+
+ WT_ERR(__wt_config_getones(session, table->config,
+ "colgroups", &table->cgconf));
+
+ /* Count the number of column groups. */
+ WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));
+ table->ncolgroups = 0;
+ while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
+ ++table->ncolgroups;
+ if (ret != WT_NOTFOUND)
+ goto err;
+
+ WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups));
+ WT_ERR(__wt_schema_open_colgroups(session, table));
+ *tablep = table;
+
+ if (0) {
+err: if (table != NULL)
+ __wt_schema_destroy_table(session, table);
+ }
+ if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+
+ __wt_free(session, tablename);
+ __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_schema_get_colgroup --
+ * Find a column group by URI.
+ */
+int
+__wt_schema_get_colgroup(WT_SESSION_IMPL *session,
+ const char *uri, WT_TABLE **tablep, WT_COLGROUP **colgroupp)
+{
+ WT_COLGROUP *colgroup;
+ WT_TABLE *table;
+ const char *tablename, *tend;
+ u_int i;
+
+ *colgroupp = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
+ return (__wt_bad_object_type(session, uri));
+
+ if ((tend = strchr(tablename, ':')) == NULL)
+ tend = tablename + strlen(tablename);
+
+ WT_RET(__wt_schema_get_table(session,
+ tablename, WT_PTRDIFF(tend, tablename), 0, &table));
+
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ colgroup = table->cgroups[i];
+ if (strcmp(colgroup->name, uri) == 0) {
+ *colgroupp = colgroup;
+ if (tablep != NULL)
+ *tablep = table;
+ else
+ __wt_schema_release_table(session, table);
+ return (0);
+ }
+ }
+
+ __wt_schema_release_table(session, table);
+ WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
+}
+
+/*
+ * __wt_schema_get_index --
+ * Find an index by URI.
+ */
+int
+__wt_schema_get_index(WT_SESSION_IMPL *session,
+ const char *uri, WT_TABLE **tablep, WT_INDEX **indexp)
+{
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_TABLE *table;
+ const char *tablename, *tend;
+ u_int i;
+
+ *indexp = NULL;
+
+ tablename = uri;
+ if (!WT_PREFIX_SKIP(tablename, "index:") ||
+ (tend = strchr(tablename, ':')) == NULL)
+ return (__wt_bad_object_type(session, uri));
+
+ WT_RET(__wt_schema_get_table(session,
+ tablename, WT_PTRDIFF(tend, tablename), 0, &table));
+
+ /* Try to find the index in the table. */
+ for (i = 0; i < table->nindices; i++) {
+ idx = table->indices[i];
+ if (strcmp(idx->name, uri) == 0) {
+ if (tablep != NULL)
+ *tablep = table;
+ else
+ __wt_schema_release_table(session, table);
+ *indexp = idx;
+ return (0);
+ }
+ }
+
+ /* Otherwise, open it. */
+ WT_ERR(__wt_schema_open_index(
+ session, table, tend + 1, strlen(tend + 1), indexp));
+
+err: __wt_schema_release_table(session, table);
+ WT_RET(ret);
+
+ if (*indexp != NULL)
+ return (0);
+
+ WT_RET_MSG(session, ENOENT, "%s not found in table", uri);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_plan.c b/src/third_party/wiredtiger/src/schema/schema_plan.c
new file mode 100644
index 00000000000..5abe0dd67d4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_plan.c
@@ -0,0 +1,394 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __find_next_col --
+ * Find the next column to use for a plan.
+ */
+static int
+__find_next_col(WT_SESSION_IMPL *session, WT_TABLE *table,
+ WT_CONFIG_ITEM *colname, u_int *cgnump, u_int *colnump, char *coltype)
+{
+ WT_COLGROUP *colgroup;
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_RET;
+ u_int cg, col, foundcg, foundcol, matchcg, matchcol;
+ int getnext;
+
+ foundcg = foundcol = UINT_MAX;
+ matchcg = *cgnump;
+ matchcol = (*coltype == WT_PROJ_KEY) ?
+ *colnump : *colnump + table->nkey_columns;
+
+ getnext = 1;
+ for (colgroup = NULL, cg = 0; cg < WT_COLGROUPS(table); cg++) {
+ colgroup = table->cgroups[cg];
+
+ /*
+ * If there is only one column group, we just scan through all
+ * of the columns. For tables with multiple column groups, we
+ * look at the key columns once, then go through the value
+ * columns for each group.
+ */
+ if (cg == 0) {
+ cval = table->colconf;
+ col = 0;
+ } else {
+cgcols: cval = colgroup->colconf;
+ col = table->nkey_columns;
+ }
+ WT_RET(__wt_config_subinit(session, &conf, &cval));
+ for (; (ret = __wt_config_next(&conf, &k, &v)) == 0; col++) {
+ if (k.len == colname->len &&
+ strncmp(colname->str, k.str, k.len) == 0) {
+ if (getnext) {
+ foundcg = cg;
+ foundcol = col;
+ }
+ getnext = (cg == matchcg && col == matchcol);
+ }
+ if (cg == 0 && table->ncolgroups > 0 &&
+ col == table->nkey_columns - 1)
+ goto cgcols;
+ }
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ colgroup = NULL;
+ }
+
+ if (foundcg == UINT_MAX)
+ return (WT_NOTFOUND);
+
+ *cgnump = foundcg;
+ if (foundcol < table->nkey_columns) {
+ *coltype = WT_PROJ_KEY;
+ *colnump = foundcol;
+ } else {
+ *coltype = WT_PROJ_VALUE;
+ *colnump = foundcol - table->nkey_columns;
+ }
+ return (0);
+}
+
+/*
+ * __wt_schema_colcheck --
+ * Check that a list of columns matches a (key,value) format pair.
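+ * For example, key_format "S" plus value_format "Si" matches either
+ * an empty columns list or exactly three named columns.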
+ */
+int
+__wt_schema_colcheck(WT_SESSION_IMPL *session,
+ const char *key_format, const char *value_format, WT_CONFIG_ITEM *colconf,
+ u_int *kcolsp, u_int *vcolsp)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ WT_PACK pack;
+ u_int kcols, ncols, vcols;
+
+ WT_RET(__pack_init(session, &pack, key_format));
+ for (kcols = 0; (ret = __pack_next(&pack, &pv)) == 0; kcols++)
+ ;
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ WT_RET(__pack_init(session, &pack, value_format));
+ for (vcols = 0; (ret = __pack_next(&pack, &pv)) == 0; vcols++)
+ ;
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ /* Walk through the named columns. */
+ WT_RET(__wt_config_subinit(session, &conf, colconf));
+ for (ncols = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; ncols++)
+ ;
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ if (ncols != 0 && ncols != kcols + vcols)
+ WT_RET_MSG(session, EINVAL, "Number of columns in '%.*s' "
+ "does not match key format '%s' plus value format '%s'",
+ (int)colconf->len, colconf->str, key_format, value_format);
+
+ if (kcolsp != NULL)
+ *kcolsp = kcols;
+ if (vcolsp != NULL)
+ *vcolsp = vcols;
+
+ return (0);
+}
+
+/*
+ * __wt_table_check --
+ * Make sure all columns appear in a column group.
+ */
+int
+__wt_table_check(WT_SESSION_IMPL *session, WT_TABLE *table)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ u_int cg, col, i;
+ char coltype;
+
+ if (table->is_simple)
+ return (0);
+
+ /* Walk through the columns. */
+ WT_RET(__wt_config_subinit(session, &conf, &table->colconf));
+
+ /* Skip over the key columns. */
+ for (i = 0; i < table->nkey_columns; i++)
+ WT_RET(__wt_config_next(&conf, &k, &v));
+ cg = col = 0;
+ coltype = 0;
+ while ((ret = __wt_config_next(&conf, &k, &v)) == 0) {
+ if (__find_next_col(
+ session, table, &k, &cg, &col, &coltype) != 0)
+ WT_RET_MSG(session, EINVAL,
+ "Column '%.*s' in '%s' does not appear in a "
+ "column group",
+ (int)k.len, k.str, table->name);
+ /*
+ * Column groups can't store key columns in their value:
+ * __wt_struct_reformat should have already detected this case.
+ */
+ WT_ASSERT(session, coltype == WT_PROJ_VALUE);
+	}
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_plan --
+ * Given a table cursor containing a complete table, build the "projection
+ * plan" to distribute the columns to dependent stores. A string
+ * representing the plan will be appended to the plan buffer.
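+ *
+ *	An illustrative example (assuming the usual single-character
+ *	WT_PROJ_* encodings 'k', 'v', 'n', 'r' and 's'): a plan such as
+ *	"0kn0vnn" means go to cursor 0's key and copy one column, then go
+ *	to cursor 0's value and copy the next two columns.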
+ */
+int
+__wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table,
+ const char *columns, size_t len, int value_only, WT_ITEM *plan)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ u_int cg, col, current_cg, current_col, i, start_cg, start_col;
+ int have_it;
+ char coltype, current_coltype;
+
+ start_cg = start_col = UINT_MAX; /* -Wuninitialized */
+
+ /* Work through the value columns by skipping over the key columns. */
+ WT_RET(__wt_config_initn(session, &conf, columns, len));
+ if (value_only)
+ for (i = 0; i < table->nkey_columns; i++)
+ WT_RET(__wt_config_next(&conf, &k, &v));
+
+ current_cg = cg = 0;
+ current_col = col = INT_MAX;
+ current_coltype = coltype = WT_PROJ_KEY; /* Keep lint quiet. */
+ for (i = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; i++) {
+ have_it = 0;
+
+ while (__find_next_col(session, table,
+ &k, &cg, &col, &coltype) == 0 &&
+ (!have_it || cg != start_cg || col != start_col)) {
+ /*
+ * First we move to the column. If that is in a
+ * different column group to the last column we
+ * accessed, or before the last column in the same
+ * column group, or moving from the key to the value,
+ * we need to switch column groups or rewind.
+ */
+ if (current_cg != cg || current_col > col ||
+ current_coltype != coltype) {
+ WT_ASSERT(session, !value_only ||
+ coltype == WT_PROJ_VALUE);
+ WT_RET(__wt_buf_catfmt(
+ session, plan, "%d%c", cg, coltype));
+
+ /*
+ * Set the current column group and column
+ * within the table.
+ */
+ current_cg = cg;
+ current_col = 0;
+ current_coltype = coltype;
+ }
+ /* Now move to the column we want. */
+ if (current_col < col) {
+ if (col - current_col > 1)
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%d", col - current_col));
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_SKIP));
+ }
+ /*
+ * Now copy the value in / out. In the common case,
+ * where each value is used in one column, we do a
+ * "next" operation. If the value is used again, we do
+ * a "reuse" operation to avoid making another copy.
+ */
+ if (!have_it) {
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_NEXT));
+
+ start_cg = cg;
+ start_col = col;
+ have_it = 1;
+ } else
+ WT_RET(__wt_buf_catfmt(session,
+ plan, "%c", WT_PROJ_REUSE));
+ current_col = col + 1;
+ }
+ }
+ WT_RET_TEST(ret != WT_NOTFOUND, ret);
+
+ /* Special case empty plans. */
+ if (i == 0 && plan->size == 0)
+ WT_RET(__wt_buf_set(session, plan, "", 1));
+
+ return (0);
+}
+
+/*
+ * __find_column_format --
+ * Find the format of the named column.
+ */
+static int
+__find_column_format(WT_SESSION_IMPL *session,
+ WT_TABLE *table, WT_CONFIG_ITEM *colname, int value_only, WT_PACK_VALUE *pv)
+{
+ WT_CONFIG conf;
+ WT_CONFIG_ITEM k, v;
+ WT_DECL_RET;
+ WT_PACK pack;
+ int inkey;
+
+ WT_RET(__wt_config_subinit(session, &conf, &table->colconf));
+ WT_RET(__pack_init(session, &pack, table->key_format));
+ inkey = 1;
+
+ while ((ret = __wt_config_next(&conf, &k, &v)) == 0) {
+ if ((ret = __pack_next(&pack, pv)) == WT_NOTFOUND && inkey) {
+ ret = __pack_init(session, &pack, table->value_format);
+ if (ret == 0)
+ ret = __pack_next(&pack, pv);
+ inkey = 0;
+ }
+ if (ret != 0)
+ return (ret);
+
+ if (k.len == colname->len &&
+ strncmp(colname->str, k.str, k.len) == 0) {
+ if (value_only && inkey)
+ return (EINVAL);
+ return (0);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_struct_reformat --
+ * Given a table and a list of columns (which could be values in a column
+ * group or index keys), calculate the resulting new format string.
+ * The result will be appended to the format buffer.
+ */
+int
+__wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table,
+ const char *columns, size_t len, const char *extra_cols, int value_only,
+ WT_ITEM *format)
+{
+ WT_CONFIG config;
+ WT_CONFIG_ITEM k, next_k, next_v;
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_RET;
+ int have_next;
+
+ WT_RET(__wt_config_initn(session, &config, columns, len));
+ /*
+ * If an empty column list is specified, this will fail with
+	 * WT_NOTFOUND; that's okay.
+ */
+ WT_RET_NOTFOUND_OK(ret = __wt_config_next(&config, &next_k, &next_v));
+ if (ret == WT_NOTFOUND) {
+ if (format->size == 0)
+ WT_RET(__wt_buf_set(session, format, "", 1));
+ return (0);
+ }
+ do {
+ k = next_k;
+ ret = __wt_config_next(&config, &next_k, &next_v);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ have_next = (ret == 0);
+
+ if (!have_next && extra_cols != NULL) {
+ WT_RET(__wt_config_init(session, &config, extra_cols));
+ WT_RET(__wt_config_next(&config, &next_k, &next_v));
+ have_next = 1;
+ extra_cols = NULL;
+ }
+
+ if ((ret = __find_column_format(session,
+ table, &k, value_only, &pv)) != 0) {
+ if (value_only && ret == EINVAL)
+ WT_RET_MSG(session, EINVAL,
+ "A column group cannot store key column "
+ "'%.*s' in its value", (int)k.len, k.str);
+ WT_RET_MSG(session, EINVAL,
+ "Column '%.*s' not found", (int)k.len, k.str);
+ }
+
+ /*
+ * Check whether we're moving an unsized WT_ITEM from the end
+ * to the middle, or vice-versa. This determines whether the
+ * size needs to be prepended. This is the only case where the
+ * destination size can be larger than the source size.
+ */
+ if (pv.type == 'u' && !pv.havesize && have_next)
+ pv.type = 'U';
+ else if (pv.type == 'U' && !have_next)
+ pv.type = 'u';
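+		/*
+		 * An illustrative case (not from the original source): with
+		 * value_format "iu", an index that reorders the columns to
+		 * put the raw column first reformats to "Ui" -- the raw
+		 * column is no longer last, so its size is prepended.
+		 */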
+
+ if (pv.havesize)
+ WT_RET(__wt_buf_catfmt(
+ session, format, "%d%c", (int)pv.size, pv.type));
+ else
+ WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
+ } while (have_next);
+
+ return (0);
+}
+
+/*
+ * __wt_struct_truncate --
+ * Return a packing string for the first N columns in a value.
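+ *
+ *	For example (illustrative): truncating the format "5sHq" to its
+ *	first two columns appends "5sH" to the output buffer.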
+ */
+int
+__wt_struct_truncate(WT_SESSION_IMPL *session,
+ const char *input_fmt, u_int ncols, WT_ITEM *format)
+{
+ WT_DECL_PACK_VALUE(pv);
+ WT_PACK pack;
+
+ WT_RET(__pack_init(session, &pack, input_fmt));
+ while (ncols-- > 0) {
+ WT_RET(__pack_next(&pack, &pv));
+ if (pv.havesize)
+ WT_RET(__wt_buf_catfmt(
+ session, format, "%d%c", (int)pv.size, pv.type));
+ else
+ WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type));
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_project.c b/src/third_party/wiredtiger/src/schema/schema_project.c
new file mode 100644
index 00000000000..9aff4c8dded
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_project.c
@@ -0,0 +1,474 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_project_in --
+ *	Given a list of cursors and a projection, read columns from the
+ * application into the dependent cursors.
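+ *
+ *	A hypothetical example: given the plan "0kn0vnn" and the va_list
+ *	arguments (10, "sales", "bob"), the key 10 is packed into cursor
+ *	0's key and the two remaining columns into cursor 0's value.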
+ */
+int
+__wt_schema_project_in(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, va_list ap)
+{
+ WT_CURSOR *c;
+ WT_DECL_ITEM(buf);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK(pack);
+ WT_PACK_VALUE old_pv;
+ size_t len, offset, old_len;
+ u_long arg;
+ char *proj;
+ uint8_t *p, *end;
+ const uint8_t *next;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ /* Reset any of the buffers we will be setting. */
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_KEY) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->key, 0));
+ } else if (*proj == WT_PROJ_VALUE) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->value, 0));
+ }
+ }
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /* We have to get a key or value before any operations. */
+ WT_ASSERT(session, buf != NULL);
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_SKIP:
+ WT_RET(__pack_next(&pack, &pv));
+ /*
+ * A nasty case: if we are inserting
+ * out-of-order, we may reach the end of the
+ * data. That's okay: we want to append in
+ * that case, and we're positioned to do that.
+ */
+ if (p == end) {
+ /* Set up an empty value. */
+ WT_CLEAR(pv.u);
+ if (pv.type == 'S' || pv.type == 's')
+ pv.u.s = "";
+
+ len = __pack_size(session, &pv);
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len));
+ p = (uint8_t *)buf->mem + buf->size;
+ WT_RET(__pack_write(
+ session, &pv, &p, len));
+ buf->size += len;
+ end = (uint8_t *)buf->mem + buf->size;
+ } else if (*proj == WT_PROJ_SKIP)
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&p,
+ (size_t)(end - p)));
+ break;
+
+ case WT_PROJ_NEXT:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_PACK_GET(session, pv, ap);
+ /* FALLTHROUGH */
+
+ case WT_PROJ_REUSE:
+ /* Read the item we're about to overwrite. */
+ next = p;
+ if (p < end) {
+ old_pv = pv;
+ WT_RET(__unpack_read(session, &old_pv,
+ &next, (size_t)(end - p)));
+ }
+ old_len = (size_t)(next - p);
+
+ len = __pack_size(session, &pv);
+ offset = WT_PTRDIFF(p, buf->mem);
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len));
+ p = (uint8_t *)buf->mem + offset;
+ end = (uint8_t *)buf->mem + buf->size + len;
+ /* Make room if we're inserting out-of-order. */
+ if (offset + old_len < buf->size)
+ memmove(p + len, p + old_len,
+ buf->size - (offset + old_len));
+ WT_RET(__pack_write(session, &pv, &p, len));
+ buf->size += len;
+ break;
+
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unexpected projection plan: %c",
+ (int)*proj);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_out --
+ *	Given a list of cursors and a projection, read columns from the
+ * dependent cursors and return them to the application.
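+ *
+ *	The mirror of __wt_schema_project_in: for example (illustrative),
+ *	the plan "0vnn" unpacks two columns from cursor 0's value into
+ *	the application's supplied pointers.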
+ */
+int
+__wt_schema_project_out(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, va_list ap)
+{
+ WT_CURSOR *c;
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ u_long arg;
+ char *proj;
+ uint8_t *p, *end;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ p = (uint8_t *)c->key.data;
+ end = p + c->key.size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ p = (uint8_t *)c->value.data;
+ end = p + c->value.size;
+ continue;
+ }
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_NEXT:
+ case WT_PROJ_SKIP:
+ case WT_PROJ_REUSE:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_RET(__unpack_read(session, &pv,
+ (const uint8_t **)&p, (size_t)(end - p)));
+ /* Only copy the value out once. */
+ if (*proj != WT_PROJ_NEXT)
+ break;
+ WT_UNPACK_PUT(session, pv, ap);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_slice --
+ *	Given a list of cursors and a projection, read columns from
+ *	a raw buffer.
+ */
+int
+__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
+ const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value)
+{
+ WT_CURSOR *c;
+ WT_DECL_ITEM(buf);
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK_VALUE(vpv);
+ WT_PACK vpack;
+ u_long arg;
+ char *proj;
+ uint8_t *end, *p;
+ const uint8_t *next, *vp, *vend;
+ size_t len, offset, old_len;
+ int skip;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ WT_RET(__pack_init(session, &vpack, vformat));
+ vp = value->data;
+ vend = vp + value->size;
+
+ /* Reset any of the buffers we will be setting. */
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_KEY) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->key, 0));
+ } else if (*proj == WT_PROJ_VALUE && !key_only) {
+ c = cp[arg];
+ WT_RET(__wt_buf_init(session, &c->value, 0));
+ }
+ }
+
+ skip = key_only;
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ skip = 0;
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ if ((skip = key_only) != 0)
+ continue;
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = (uint8_t *)buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /* We have to get a key or value before any operations. */
+ WT_ASSERT(session, skip || buf != NULL);
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_SKIP:
+ if (skip)
+ break;
+ WT_RET(__pack_next(&pack, &pv));
+
+ /*
+ * A nasty case: if we are inserting
+ * out-of-order, append a zero value to keep
+ * the buffer in the correct format.
+ */
+ if (p == end) {
+ /* Set up an empty value. */
+ WT_CLEAR(pv.u);
+ if (pv.type == 'S' || pv.type == 's')
+ pv.u.s = "";
+
+ len = __pack_size(session, &pv);
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len));
+ p = (uint8_t *)buf->data + buf->size;
+ WT_RET(__pack_write(
+ session, &pv, &p, len));
+ end = p;
+ buf->size += len;
+ } else
+ WT_RET(__unpack_read(session,
+ &pv, (const uint8_t **)&p,
+ (size_t)(end - p)));
+ break;
+
+ case WT_PROJ_NEXT:
+ WT_RET(__pack_next(&vpack, &vpv));
+ WT_RET(__unpack_read(session, &vpv,
+ &vp, (size_t)(vend - vp)));
+ /* FALLTHROUGH */
+
+ case WT_PROJ_REUSE:
+ if (skip)
+ break;
+
+ /*
+ * Read the item we're about to overwrite.
+ *
+ * There is subtlety here: the value format
+ * may not exactly match the cursor's format.
+ * In particular, we need lengths with raw
+ * columns in the middle of a packed struct,
+ * but not if they are at the end of a struct.
+ */
+ WT_RET(__pack_next(&pack, &pv));
+
+ next = p;
+ if (p < end)
+ WT_RET(__unpack_read(session, &pv,
+ &next, (size_t)(end - p)));
+ old_len = (size_t)(next - p);
+
+ /* Make sure the types are compatible. */
+ WT_ASSERT(session,
+ tolower(pv.type) == tolower(vpv.type));
+ pv.u = vpv.u;
+
+ len = __pack_size(session, &pv);
+ offset = WT_PTRDIFF(p, buf->data);
+ /*
+ * Avoid growing the buffer if the value fits.
+ * This is not just a performance issue: it
+ * covers the case of record number keys, which
+ * have to be written to cursor->recno.
+ */
+ if (len > old_len)
+ WT_RET(__wt_buf_grow(session,
+ buf, buf->size + len - old_len));
+ p = (uint8_t *)buf->data + offset;
+ /* Make room if we're inserting out-of-order. */
+ if (offset + old_len < buf->size)
+ memmove(p + len, p + old_len,
+ buf->size - (offset + old_len));
+ WT_RET(__pack_write(session, &pv, &p, len));
+ buf->size += len - old_len;
+ end = (uint8_t *)buf->data + buf->size;
+ break;
+ default:
+ WT_RET_MSG(session, EINVAL,
+ "unexpected projection plan: %c",
+ (int)*proj);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_schema_project_merge --
+ *	Given a list of cursors and a projection, build a buffer containing the
+ * column values read from the cursors.
+ */
+int
+__wt_schema_project_merge(WT_SESSION_IMPL *session,
+ WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
+{
+ WT_CURSOR *c;
+ WT_ITEM *buf;
+ WT_DECL_PACK(pack);
+ WT_DECL_PACK_VALUE(pv);
+ WT_DECL_PACK_VALUE(vpv);
+ WT_PACK vpack;
+ u_long arg;
+ char *proj;
+ const uint8_t *p, *end;
+ uint8_t *vp;
+ size_t len;
+
+ p = end = NULL; /* -Wuninitialized */
+
+ WT_RET(__wt_buf_init(session, value, 0));
+ WT_RET(__pack_init(session, &vpack, vformat));
+
+ for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
+ arg = strtoul(proj, &proj, 10);
+
+ switch (*proj) {
+ case WT_PROJ_KEY:
+ c = cp[arg];
+ if (WT_CURSOR_RECNO(c)) {
+ c->key.data = &c->recno;
+ c->key.size = sizeof(c->recno);
+ WT_RET(__pack_init(session, &pack, "R"));
+ } else
+ WT_RET(__pack_init(
+ session, &pack, c->key_format));
+ buf = &c->key;
+ p = buf->data;
+ end = p + buf->size;
+ continue;
+
+ case WT_PROJ_VALUE:
+ c = cp[arg];
+ WT_RET(__pack_init(session, &pack, c->value_format));
+ buf = &c->value;
+ p = buf->data;
+ end = p + buf->size;
+ continue;
+ }
+
+ /*
+ * Otherwise, the argument is a count, where a missing
+ * count means a count of 1.
+ */
+ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
+ switch (*proj) {
+ case WT_PROJ_NEXT:
+ case WT_PROJ_SKIP:
+ case WT_PROJ_REUSE:
+ WT_RET(__pack_next(&pack, &pv));
+ WT_RET(__unpack_read(session, &pv,
+ &p, (size_t)(end - p)));
+ /* Only copy the value out once. */
+ if (*proj != WT_PROJ_NEXT)
+ break;
+
+ WT_RET(__pack_next(&vpack, &vpv));
+ /* Make sure the types are compatible. */
+ WT_ASSERT(session,
+ tolower(pv.type) == tolower(vpv.type));
+ vpv.u = pv.u;
+ len = __pack_size(session, &vpv);
+ WT_RET(__wt_buf_grow(session,
+ value, value->size + len));
+ vp = (uint8_t *)value->mem + value->size;
+ WT_RET(__pack_write(session, &vpv, &vp, len));
+ value->size += len;
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
new file mode 100644
index 00000000000..8605ea41c80
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -0,0 +1,276 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __rename_file --
+ * WT_SESSION::rename for a file.
+ */
+static int
+__rename_file(
+ WT_SESSION_IMPL *session, const char *uri, const char *newuri)
+{
+ WT_DECL_RET;
+ int exist;
+ const char *filename, *newfile, *newvalue, *oldvalue;
+
+ newvalue = oldvalue = NULL;
+
+ filename = uri;
+ newfile = newuri;
+ if (!WT_PREFIX_SKIP(filename, "file:") ||
+ !WT_PREFIX_SKIP(newfile, "file:"))
+ return (EINVAL);
+
+ /* Close any btree handles in the file. */
+ WT_ERR(__wt_conn_dhandle_close_all(session, uri, 0));
+
+ /*
+ * First, check if the file being renamed exists in the system. Doing
+ * this check first matches the table rename behavior because we return
+ * WT_NOTFOUND when the renamed file doesn't exist (subsequently mapped
+ * to ENOENT by the session layer).
+ */
+ WT_ERR(__wt_metadata_search(session, uri, &oldvalue));
+
+ /*
+ * Check to see if the proposed name is already in use, in either the
+ * metadata or the filesystem.
+ */
+ switch (ret = __wt_metadata_search(session, newuri, &newvalue)) {
+ case 0:
+ WT_ERR_MSG(session, EEXIST, "%s", newuri);
+ /* NOTREACHED */
+ case WT_NOTFOUND:
+ break;
+ default:
+ WT_ERR(ret);
+ }
+ WT_ERR(__wt_exist(session, newfile, &exist));
+ if (exist)
+ WT_ERR_MSG(session, EEXIST, "%s", newfile);
+
+ /* Replace the old file entries with new file entries. */
+ WT_ERR(__wt_metadata_remove(session, uri));
+ WT_ERR(__wt_metadata_insert(session, newuri, oldvalue));
+
+ /* Rename the underlying file. */
+ WT_ERR(__wt_rename(session, filename, newfile));
+ if (WT_META_TRACKING(session))
+ WT_ERR(__wt_meta_track_fileop(session, uri, newuri));
+
+err: __wt_free(session, newvalue);
+ __wt_free(session, oldvalue);
+ return (ret);
+}
+
+/*
+ * __rename_tree --
+ * Rename an index or colgroup reference.
+ */
+static int
+__rename_tree(WT_SESSION_IMPL *session,
+ WT_TABLE *table, const char *newuri, const char *name, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_ITEM(nn);
+ WT_DECL_ITEM(ns);
+ WT_DECL_ITEM(nv);
+ WT_DECL_ITEM(os);
+ WT_DECL_RET;
+ const char *newname, *olduri, *suffix, *value;
+ int is_colgroup;
+
+ olduri = table->name;
+ value = NULL;
+
+ newname = newuri;
+ (void)WT_PREFIX_SKIP(newname, "table:");
+
+ /*
+ * Create the new data source URI and update the schema value.
+ *
+ * 'name' has the format (colgroup|index):<tablename>[:<suffix>];
+ * we need the suffix.
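+	 *
+	 * For example (illustrative): when renaming "table:old" to
+	 * "table:new", "index:old:bucket" becomes "index:new:bucket".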
+ */
+ is_colgroup = WT_PREFIX_MATCH(name, "colgroup:");
+ if (!is_colgroup && !WT_PREFIX_MATCH(name, "index:"))
+ WT_ERR_MSG(session, EINVAL,
+ "expected a 'colgroup:' or 'index:' source: '%s'", name);
+
+ suffix = strchr(name, ':');
+ /* An existing table should have a well formed name. */
+ WT_ASSERT(session, suffix != NULL);
+ suffix = strchr(suffix + 1, ':');
+
+ WT_ERR(__wt_scr_alloc(session, 0, &nn));
+ WT_ERR(__wt_buf_fmt(session, nn, "%s%s%s",
+ is_colgroup ? "colgroup:" : "index:",
+ newname,
+ (suffix == NULL) ? "" : suffix));
+
+ /* Skip the colon, if any. */
+ if (suffix != NULL)
+ ++suffix;
+
+ /* Read the old schema value. */
+ WT_ERR(__wt_metadata_search(session, name, &value));
+
+ /*
+ * Calculate the new data source URI. Use the existing table structure
+ * and substitute the new name temporarily.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &ns));
+ table->name = newuri;
+ if (is_colgroup)
+ WT_ERR(__wt_schema_colgroup_source(
+ session, table, suffix, value, ns));
+ else
+ WT_ERR(__wt_schema_index_source(
+ session, table, suffix, value, ns));
+
+ if ((ret = __wt_config_getones(session, value, "source", &cval)) != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "index or column group has no data source: %s", value);
+
+ /* Take a copy of the old data source. */
+ WT_ERR(__wt_scr_alloc(session, 0, &os));
+ WT_ERR(__wt_buf_fmt(session, os, "%.*s", (int)cval.len, cval.str));
+
+ /* Overwrite it with the new data source. */
+ WT_ERR(__wt_scr_alloc(session, 0, &nv));
+ WT_ERR(__wt_buf_fmt(session, nv, "%.*s%s%s",
+ (int)WT_PTRDIFF(cval.str, value), value,
+ (const char *)ns->data,
+ cval.str + cval.len));
+
+ /*
+ * Remove the old metadata entry.
+ * Insert the new metadata entry.
+ */
+ WT_ERR(__wt_metadata_remove(session, name));
+ WT_ERR(__wt_metadata_insert(session, nn->data, nv->data));
+
+ /* Rename the file. */
+ WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg));
+
+err: __wt_scr_free(&nn);
+ __wt_scr_free(&ns);
+ __wt_scr_free(&nv);
+ __wt_scr_free(&os);
+ __wt_free(session, value);
+ table->name = olduri;
+ return (ret);
+}
+
+/*
+ * __metadata_rename --
+ * Rename an entry in the metadata table.
+ */
+static int
+__metadata_rename(WT_SESSION_IMPL *session, const char *uri, const char *newuri)
+{
+ WT_DECL_RET;
+ const char *value;
+
+ WT_RET(__wt_metadata_search(session, uri, &value));
+ WT_ERR(__wt_metadata_remove(session, uri));
+ WT_ERR(__wt_metadata_insert(session, newuri, value));
+
+err: __wt_free(session, value);
+ return (ret);
+}
+
+/*
+ * __rename_table --
+ * WT_SESSION::rename for a table.
+ */
+static int
+__rename_table(WT_SESSION_IMPL *session,
+ const char *uri, const char *newuri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+ u_int i;
+ const char *oldname;
+
+ oldname = uri;
+ (void)WT_PREFIX_SKIP(oldname, "table:");
+
+ WT_RET(__wt_schema_get_table(
+ session, oldname, strlen(oldname), 0, &table));
+
+ /* Rename the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++)
+ WT_ERR(__rename_tree(session, table, newuri,
+ table->cgroups[i]->name, cfg));
+
+ /* Rename the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++)
+ WT_ERR(__rename_tree(session, table, newuri,
+ table->indices[i]->name, cfg));
+
+ __wt_schema_remove_table(session, table);
+ table = NULL;
+
+ /* Rename the table. */
+ WT_ERR(__metadata_rename(session, uri, newuri));
+
+err: if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __wt_schema_rename --
+ * WT_SESSION::rename.
+ */
+int
+__wt_schema_rename(WT_SESSION_IMPL *session,
+ const char *uri, const char *newuri, const char *cfg[])
+{
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *p, *t;
+
+ /* The target type must match the source type. */
+ for (p = uri, t = newuri; *p == *t && *p != ':'; ++p, ++t)
+ ;
+ if (*p != ':' || *t != ':')
+ WT_RET_MSG(session, EINVAL,
+ "rename target type must match URI: %s to %s", uri, newuri);
+
+ /*
+ * We track rename operations, if we fail in the middle, we want to
+ * back it all out.
+ */
+ WT_RET(__wt_meta_track_on(session));
+
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ ret = __rename_file(session, uri, newuri);
+ else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_rename(session, uri, newuri, cfg);
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __rename_table(session, uri, newuri, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->rename == NULL ?
+ __wt_object_unsupported(session, uri) :
+ dsrc->rename(dsrc,
+ &session->iface, uri, newuri, (WT_CONFIG_ARG *)cfg);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ /* Bump the schema generation so that stale data is ignored. */
+ ++S2C(session)->schema_gen;
+
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ /* If we didn't find a metadata entry, map that error to ENOENT. */
+ return (ret == WT_NOTFOUND ? ENOENT : ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_stat.c b/src/third_party/wiredtiger/src/schema/schema_stat.c
new file mode 100644
index 00000000000..cb8e7f6c418
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_stat.c
@@ -0,0 +1,114 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_curstat_colgroup_init --
+ * Initialize the statistics for a column group.
+ */
+int
+__wt_curstat_colgroup_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_COLGROUP *colgroup;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+
+ WT_RET(__wt_schema_get_colgroup(session, uri, NULL, &colgroup));
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", colgroup->source));
+ ret = __wt_curstat_init(session, buf->data, cfg, cst);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_curstat_index_init --
+ * Initialize the statistics for an index.
+ */
+int
+__wt_curstat_index_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_INDEX *idx;
+
+ WT_RET(__wt_schema_get_index(session, uri, NULL, &idx));
+
+ WT_RET(__wt_scr_alloc(session, 0, &buf));
+ WT_ERR(__wt_buf_fmt(session, buf, "statistics:%s", idx->source));
+ ret = __wt_curstat_init(session, buf->data, cfg, cst);
+
+err: __wt_scr_free(&buf);
+ return (ret);
+}
+
+/*
+ * __wt_curstat_table_init --
+ * Initialize the statistics for a table.
+ */
+int
+__wt_curstat_table_init(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], WT_CURSOR_STAT *cst)
+{
+ WT_CURSOR *stat_cursor;
+ WT_DECL_ITEM(buf);
+ WT_DECL_RET;
+ WT_DSRC_STATS *new, *stats;
+ WT_TABLE *table;
+ u_int i;
+ const char *name;
+
+ name = uri + strlen("table:");
+ WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
+
+ WT_ERR(__wt_scr_alloc(session, 0, &buf));
+
+ /*
+ * Process the column groups.
+ *
+ * Set the cursor to reference the data source statistics; we don't
+	 * initialize it; instead we copy (rather than aggregate) the first
+	 * column group's statistics, which has the same effect.
+ */
+ stats = &cst->u.dsrc_stats;
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "statistics:%s", table->cgroups[i]->name));
+ WT_ERR(__wt_curstat_open(
+ session, buf->data, cfg, &stat_cursor));
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ if (i == 0)
+ *stats = *new;
+ else
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+ /* Process the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ WT_ERR(__wt_buf_fmt(
+ session, buf, "statistics:%s", table->indices[i]->name));
+ WT_ERR(__wt_curstat_open(
+ session, buf->data, cfg, &stat_cursor));
+ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
+ __wt_stat_aggregate_dsrc_stats(new, stats);
+ WT_ERR(stat_cursor->close(stat_cursor));
+ }
+
+ __wt_curstat_dsrc_final(cst);
+
+err: __wt_schema_release_table(session, table);
+
+ __wt_scr_free(&buf);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_truncate.c b/src/third_party/wiredtiger/src/schema/schema_truncate.c
new file mode 100644
index 00000000000..1da3b103f10
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_truncate.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __truncate_file --
+ * WT_SESSION::truncate for a file.
+ */
+static int
+__truncate_file(WT_SESSION_IMPL *session, const char *name)
+{
+ const char *filename;
+ uint32_t allocsize;
+
+ filename = name;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ return (EINVAL);
+
+ /* Open and lock the file. */
+ WT_RET(__wt_session_get_btree(
+ session, name, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
+
+ /* Get the allocation size. */
+ allocsize = S2BT(session)->allocsize;
+
+ WT_RET(__wt_session_release_btree(session));
+
+ /* Close any btree handles in the file. */
+ WT_RET(__wt_conn_dhandle_close_all(session, name, 0));
+
+ /* Delete the root address and truncate the file. */
+ WT_RET(__wt_meta_checkpoint_clear(session, name));
+ WT_RET(__wt_block_manager_truncate(session, filename, allocsize));
+
+ return (0);
+}
+
+/*
+ * __truncate_table --
+ * WT_SESSION::truncate for a table.
+ */
+static int
+__truncate_table(WT_SESSION_IMPL *session, const char *name, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TABLE *table;
+ u_int i;
+
+ WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
+
+ /* Truncate the column groups. */
+ for (i = 0; i < WT_COLGROUPS(table); i++)
+ WT_ERR(__wt_schema_truncate(
+ session, table->cgroups[i]->source, cfg));
+
+ /* Truncate the indices. */
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++)
+ WT_ERR(__wt_schema_truncate(
+ session, table->indices[i]->source, cfg));
+
+err: __wt_schema_release_table(session, table);
+ return (ret);
+}
+
+/*
+ * __truncate_dsrc --
+ * WT_SESSION::truncate for a data-source without a truncate operation.
+ */
+static int
+__truncate_dsrc(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *cfg[2];
+
+ /* Open a cursor and traverse the object, removing every entry. */
+ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
+ cfg[1] = NULL;
+ WT_RET(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
+ while ((ret = cursor->next(cursor)) == 0)
+ WT_ERR(cursor->remove(cursor));
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __wt_schema_truncate --
+ * WT_SESSION::truncate without a range.
+ */
+int
+__wt_schema_truncate(
+ WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *tablename;
+
+ tablename = uri;
+
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ ret = __truncate_file(session, uri);
+ } else if (WT_PREFIX_MATCH(uri, "lsm:"))
+ ret = __wt_lsm_tree_truncate(session, uri, cfg);
+ else if (WT_PREFIX_SKIP(tablename, "table:"))
+ ret = __truncate_table(session, tablename, cfg);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ ret = dsrc->truncate == NULL ?
+ __truncate_dsrc(session, uri) :
+ dsrc->truncate(
+ dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg);
+ else
+ ret = __wt_bad_object_type(session, uri);
+
+ /* If we didn't find a metadata entry, map that error to ENOENT. */
+ return (ret == WT_NOTFOUND ? ENOENT : ret);
+}
+
+/*
+ * __wt_range_truncate --
+ * Truncate of a cursor range, default implementation.
+ */
+int
+__wt_range_truncate(WT_CURSOR *start, WT_CURSOR *stop)
+{
+ WT_DECL_RET;
+ int cmp;
+
+ if (start == NULL) {
+ do {
+ WT_RET(stop->remove(stop));
+ } while ((ret = stop->prev(stop)) == 0);
+ WT_RET_NOTFOUND_OK(ret);
+ } else {
+ cmp = -1;
+ do {
+ if (stop != NULL)
+ WT_RET(start->compare(start, stop, &cmp));
+ WT_RET(start->remove(start));
+ } while (cmp < 0 && (ret = start->next(start)) == 0);
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ return (0);
+}
+
+/*
+ * __wt_schema_range_truncate --
+ * WT_SESSION::truncate with a range.
+ */
+int
+__wt_schema_range_truncate(
+ WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop)
+{
+ WT_CURSOR *cursor;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ const char *uri;
+
+ cursor = (start != NULL) ? start : stop;
+ uri = cursor->internal_uri;
+
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree,
+ ret = __wt_btcur_range_truncate(
+ (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop));
+ else if (WT_PREFIX_MATCH(uri, "table:"))
+ ret = __wt_table_range_truncate(
+ (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop);
+ else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL &&
+ dsrc->range_truncate != NULL)
+ ret = dsrc->range_truncate(dsrc, &session->iface, start, stop);
+ else
+ ret = __wt_range_truncate(start, stop);
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_util.c b/src/third_party/wiredtiger/src/schema/schema_util.c
new file mode 100644
index 00000000000..263f56f1c41
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_util.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_get_source --
+ *	Find a matching data source, if any; callers report an error on NULL.
+ */
+WT_DATA_SOURCE *
+__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_NAMED_DATA_SOURCE *ndsrc;
+
+ TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q)
+ if (WT_PREFIX_MATCH(name, ndsrc->prefix))
+ return (ndsrc->dsrc);
+ return (NULL);
+}
+
+/*
+ * __wt_str_name_check --
+ * Disallow any use of the WiredTiger name space.
+ */
+int
+__wt_str_name_check(WT_SESSION_IMPL *session, const char *str)
+{
+ const char *name, *sep;
+ int skipped;
+
+ /*
+ * Check if name is somewhere in the WiredTiger name space: it would be
+	 * "bad" if the application truncated the metadata file.  Skip any
+	 * leading URI prefix and check, then skip over a table name and
+	 * check again.
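+	 *
+	 * For example (illustrative): "table:WiredTigerX" and
+	 * "index:mytable:WiredTigerIdx" are both rejected.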
+ */
+ name = str;
+ for (skipped = 0; skipped < 2; skipped++) {
+ if ((sep = strchr(name, ':')) == NULL)
+ break;
+
+ name = sep + 1;
+ if (WT_PREFIX_MATCH(name, "WiredTiger"))
+ WT_RET_MSG(session, EINVAL,
+ "%s: the \"WiredTiger\" name space may not be "
+ "used by applications", name);
+ }
+
+ /*
+ * Disallow JSON quoting characters -- the config string parsing code
+ * supports quoted strings, but there's no good reason to use them in
+ * names and we're not going to do the testing.
+ */
+ if (strpbrk(name, "{},:[]\\\"'") != NULL)
+ WT_RET_MSG(session, EINVAL,
+ "%s: WiredTiger objects should not include grouping "
+ "characters in their names",
+ name);
+
+ return (0);
+}
+
+/*
+ * __wt_name_check --
+ * Disallow any use of the WiredTiger name space.
+ */
+int
+__wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len)
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(tmp);
+
+ WT_RET(__wt_scr_alloc(session, len, &tmp));
+
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)len, str));
+
+ ret = __wt_str_name_check(session, tmp->data);
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c
new file mode 100644
index 00000000000..8e7ed3925f6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/schema/schema_worker.c
@@ -0,0 +1,134 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_schema_worker --
+ * Get Btree handles for the object and cycle through calls to an
+ * underlying worker function with each handle.
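+ *
+ *	For example (illustrative): verifying "table:main" opens the
+ *	table, recurses on each column group and index source URI, and
+ *	finally runs the worker function on the underlying "file:" btree
+ *	handles.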
+ */
+int
+__wt_schema_worker(WT_SESSION_IMPL *session,
+ const char *uri,
+ int (*file_func)(WT_SESSION_IMPL *, const char *[]),
+ int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
+ const char *cfg[], uint32_t open_flags)
+{
+ WT_COLGROUP *colgroup;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_SESSION *wt_session;
+ WT_TABLE *table;
+ const char *tablename;
+ u_int i;
+ int skip;
+
+ table = NULL;
+ tablename = uri;
+
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(session, uri, &skip));
+
+ /* If the callback said to skip this object, we're done. */
+ if (skip)
+ return (0);
+
+ /* Get the btree handle(s) and call the underlying function. */
+ if (WT_PREFIX_MATCH(uri, "file:")) {
+ if (file_func != NULL) {
+ /*
+ * If the operation requires exclusive access, close
+ * any open file handles, including checkpoints.
+ */
+ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE))
+ WT_ERR(__wt_conn_dhandle_close_all(
+ session, uri, 0));
+
+ WT_ERR(__wt_session_get_btree_ckpt(
+ session, uri, cfg, open_flags));
+ ret = file_func(session, cfg);
+ WT_TRET(__wt_session_release_btree(session));
+ }
+ } else if (WT_PREFIX_MATCH(uri, "colgroup:")) {
+ WT_ERR(__wt_schema_get_colgroup(session, uri, NULL, &colgroup));
+ WT_ERR(__wt_schema_worker(session, colgroup->source,
+ file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_SKIP(tablename, "index:")) {
+ idx = NULL;
+ WT_ERR(__wt_schema_get_index(session, uri, NULL, &idx));
+ WT_ERR(__wt_schema_worker(session, idx->source,
+ file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_MATCH(uri, "lsm:")) {
+ /*
+ * LSM compaction is handled elsewhere, but if we get here
+ * trying to compact files, don't descend into an LSM tree.
+ */
+ if (file_func != __wt_compact)
+ WT_ERR(__wt_lsm_tree_worker(session,
+ uri, file_func, name_func, cfg, open_flags));
+ } else if (WT_PREFIX_SKIP(tablename, "table:")) {
+ WT_ERR(__wt_schema_get_table(session,
+ tablename, strlen(tablename), 0, &table));
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ /*
+ * We could make a recursive call for each colgroup or index
+ * URI, but since we have already opened the table, we can take
+	 * a shortcut and skip straight to the sources.  If we have a
+ * name function, it needs to know about the intermediate URIs.
+ */
+ for (i = 0; i < WT_COLGROUPS(table); i++) {
+ colgroup = table->cgroups[i];
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(
+ session, colgroup->name, &skip));
+ if (!skip)
+ WT_ERR(__wt_schema_worker(
+ session, colgroup->source,
+ file_func, name_func, cfg, open_flags));
+ }
+
+ WT_ERR(__wt_schema_open_indices(session, table));
+ for (i = 0; i < table->nindices; i++) {
+ idx = table->indices[i];
+ skip = 0;
+ if (name_func != NULL)
+ WT_ERR(name_func(session, idx->name, &skip));
+ if (!skip)
+ WT_ERR(__wt_schema_worker(session, idx->source,
+ file_func, name_func, cfg, open_flags));
+ }
+ } else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) {
+ wt_session = (WT_SESSION *)session;
+ if (file_func == __wt_compact && dsrc->compact != NULL)
+ WT_ERR(dsrc->compact(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_salvage && dsrc->salvage != NULL)
+ WT_ERR(dsrc->salvage(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_verify && dsrc->verify != NULL)
+ WT_ERR(dsrc->verify(
+ dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg));
+ else if (file_func == __wt_checkpoint)
+ ;
+ else if (file_func == __wt_checkpoint_list)
+ ;
+ else if (file_func == __wt_checkpoint_sync)
+ ;
+ else
+ WT_ERR(__wt_object_unsupported(session, uri));
+ } else
+ WT_ERR(__wt_bad_object_type(session, uri));
+
+err: if (table != NULL)
+ __wt_schema_release_table(session, table);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c
new file mode 100644
index 00000000000..39b9dd0de61
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_api.c
@@ -0,0 +1,1054 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static int __session_checkpoint(WT_SESSION *, const char *);
+static int __session_rollback_transaction(WT_SESSION *, const char *);
+
+/*
+ * __wt_session_reset_cursors --
+ * Reset all open cursors.
+ */
+int
+__wt_session_reset_cursors(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ TAILQ_FOREACH(cursor, &session->cursors, q) {
+ /* Stop when there are no positioned cursors. */
+ if (session->ncursors == 0)
+ break;
+ WT_TRET(cursor->reset(cursor));
+ }
+ return (ret);
+}
+
+/*
+ * __wt_session_copy_values --
+ * Copy values into all positioned cursors, so that they don't keep
+ * transaction IDs pinned.
+ */
+int
+__wt_session_copy_values(WT_SESSION_IMPL *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ TAILQ_FOREACH(cursor, &session->cursors, q)
+ if (F_ISSET(cursor, WT_CURSTD_VALUE_INT)) {
+ F_CLR(cursor, WT_CURSTD_VALUE_INT);
+ WT_RET(__wt_buf_set(session, &cursor->value,
+ cursor->value.data, cursor->value.size));
+ F_SET(cursor, WT_CURSTD_VALUE_EXT);
+ }
+
+ return (ret);
+}
+
+/*
+ * __session_clear --
+ * Clear a session structure.
+ */
+static void
+__session_clear(WT_SESSION_IMPL *session)
+{
+ /*
+ * There's no serialization support around the review of the hazard
+ * array, which means threads checking for hazard pointers first check
+ * the active field (which may be 0) and then use the hazard pointer
+ * (which cannot be NULL).
+ *
+ * Additionally, the session structure can include information that
+ * persists past the session's end-of-life, stored as part of page
+ * splits.
+ *
+ * For these reasons, be careful when clearing the session structure.
+ */
+ memset(session, 0, WT_SESSION_CLEAR_SIZE(session));
+ session->hazard_size = 0;
+ session->nhazard = 0;
+}
+
+/*
+ * __session_close --
+ * WT_SESSION->close method.
+ */
+static int
+__session_close(WT_SESSION *wt_session, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_session->connection;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, close, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Rollback any active transaction. */
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_TRET(__session_rollback_transaction(wt_session, NULL));
+
+ /*
+ * Also release any pinned transaction ID from a non-transactional
+ * operation.
+ */
+ if (conn->txn_global.states != NULL)
+ __wt_txn_release_snapshot(session);
+
+ /* Close all open cursors. */
+ while ((cursor = TAILQ_FIRST(&session->cursors)) != NULL) {
+ /*
+ * Notify the user that we are closing the cursor handle
+ * via the registered close callback.
+ */
+ if (session->event_handler->handle_close != NULL)
+ WT_TRET(session->event_handler->handle_close(
+ session->event_handler, wt_session, cursor));
+ WT_TRET(cursor->close(cursor));
+ }
+
+ WT_ASSERT(session, session->ncursors == 0);
+
+ /* Discard cached handles. */
+ __wt_session_close_cache(session);
+
+ /* Close all tables. */
+ __wt_schema_close_tables(session);
+
+ /* Discard metadata tracking. */
+ __wt_meta_track_discard(session);
+
+ /* Discard scratch buffers. */
+ __wt_scr_discard(session);
+
+ /* Free transaction information. */
+ __wt_txn_destroy(session);
+
+ /* Confirm we're not holding any hazard pointers. */
+ __wt_hazard_close(session);
+
+ /* Cleanup */
+ if (session->block_manager_cleanup != NULL)
+ WT_TRET(session->block_manager_cleanup(session));
+ if (session->reconcile_cleanup != NULL)
+ WT_TRET(session->reconcile_cleanup(session));
+
+ /* Free the eviction exclusive-lock information. */
+ __wt_free(session, session->excl);
+
+ /* Destroy the thread's mutex. */
+ WT_TRET(__wt_cond_destroy(session, &session->cond));
+
+ /* The API lock protects opening and closing of sessions. */
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /* Decrement the count of open sessions. */
+ WT_STAT_FAST_CONN_DECR(session, session_open);
+
+ /*
+ * Sessions are re-used, clear the structure: the clear sets the active
+ * field to 0, which will exclude the hazard array from review by the
+ * eviction thread. Because some session fields are accessed by other
+ * threads, the structure must be cleared carefully.
+ *
+	 * We don't need to publish here: regardless of whether the active
+	 * field is non-zero, the hazard pointer is always valid.
+ */
+ __session_clear(session);
+ session = conn->default_session;
+
+ /*
+ * Decrement the count of active sessions if that's possible: a session
+	 * being closed may or may not be at the end of the array; step toward
+ * the beginning of the array until we reach an active session.
+ */
+ while (conn->sessions[conn->session_cnt - 1].active == 0)
+ if (--conn->session_cnt == 0)
+ break;
+
+ __wt_spin_unlock(session, &conn->api_lock);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_reconfigure --
+ * WT_SESSION->reconfigure method.
+ */
+static int
+__session_reconfigure(WT_SESSION *wt_session, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, reconfigure, config, cfg);
+
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL, "transaction in progress");
+
+ WT_TRET(__wt_session_reset_cursors(session));
+
+ WT_ERR(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
+ if (cval.len != 0)
+ session->isolation = session->txn.isolation =
+ WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
+ TXN_ISO_SNAPSHOT :
+ WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
+ TXN_ISO_READ_UNCOMMITTED : TXN_ISO_READ_COMMITTED;
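+	/*
+	 * For example (illustrative):
+	 *	wt_session->reconfigure(wt_session, "isolation=snapshot");
+	 * makes subsequent transactions in this session run at snapshot
+	 * isolation.
+	 */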
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_open_cursor --
+ * Internal version of WT_SESSION::open_cursor.
+ */
+int
+__wt_open_cursor(WT_SESSION_IMPL *session,
+ const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
+{
+ WT_COLGROUP *colgroup;
+ WT_DATA_SOURCE *dsrc;
+ WT_DECL_RET;
+
+ *cursorp = NULL;
+
+ /*
+ * Open specific cursor types we know about, or call the generic data
+ * source open function.
+ *
+ * Unwind a set of string comparisons into a switch statement hoping
+ * the compiler can make it fast, but list the common choices first
+ * instead of sorting so if/else patterns are still fast.
+ */
+ switch (uri[0]) {
+ /*
+ * Common cursor types.
+ */
+ case 't':
+ if (WT_PREFIX_MATCH(uri, "table:"))
+ WT_RET(__wt_curtable_open(session, uri, cfg, cursorp));
+ break;
+ case 'c':
+ if (WT_PREFIX_MATCH(uri, "colgroup:")) {
+ /*
+ * Column groups are a special case: open a cursor on
+ * the underlying data source.
+ */
+ WT_RET(__wt_schema_get_colgroup(
+ session, uri, NULL, &colgroup));
+ WT_RET(__wt_open_cursor(
+ session, colgroup->source, owner, cfg, cursorp));
+ } else if (WT_PREFIX_MATCH(uri, "config:"))
+ WT_RET(__wt_curconfig_open(
+ session, uri, cfg, cursorp));
+ break;
+ case 'i':
+ if (WT_PREFIX_MATCH(uri, "index:"))
+ WT_RET(__wt_curindex_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'l':
+ if (WT_PREFIX_MATCH(uri, "lsm:"))
+ WT_RET(__wt_clsm_open(
+ session, uri, owner, cfg, cursorp));
+ else if (WT_PREFIX_MATCH(uri, "log:"))
+ WT_RET(__wt_curlog_open(session, uri, cfg, cursorp));
+ break;
+
+ /*
+ * Less common cursor types.
+ */
+ case 'f':
+ if (WT_PREFIX_MATCH(uri, "file:"))
+ WT_RET(__wt_curfile_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'm':
+ if (WT_PREFIX_MATCH(uri, WT_METADATA_URI))
+ WT_RET(__wt_curmetadata_open(
+ session, uri, owner, cfg, cursorp));
+ break;
+ case 'b':
+ if (WT_PREFIX_MATCH(uri, "backup:"))
+ WT_RET(__wt_curbackup_open(
+ session, uri, cfg, cursorp));
+ break;
+ case 's':
+ if (WT_PREFIX_MATCH(uri, "statistics:"))
+ WT_RET(__wt_curstat_open(session, uri, cfg, cursorp));
+ break;
+ default:
+ break;
+ }
+
+ if (*cursorp == NULL &&
+ (dsrc = __wt_schema_get_source(session, uri)) != NULL)
+ WT_RET(dsrc->open_cursor == NULL ?
+ __wt_object_unsupported(session, uri) :
+ __wt_curds_open(session, uri, owner, cfg, dsrc, cursorp));
+
+ if (*cursorp == NULL)
+ return (__wt_bad_object_type(session, uri));
+
+ /*
+ * When opening simple tables, the table code calls this function on the
+ * underlying data source, in which case the application's URI has been
+ * copied.
+ */
+ if ((*cursorp)->uri == NULL &&
+ (ret = __wt_strdup(session, uri, &(*cursorp)->uri)) != 0)
+ WT_TRET((*cursorp)->close(*cursorp));
+
+ return (ret);
+}
+
+/*
+ * __session_open_cursor --
+ * WT_SESSION->open_cursor method.
+ */
+static int
+__session_open_cursor(WT_SESSION *wt_session,
+ const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ cursor = *cursorp = NULL;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, open_cursor, config, cfg);
+
+ if ((to_dup == NULL && uri == NULL) || (to_dup != NULL && uri != NULL))
+ WT_ERR_MSG(session, EINVAL,
+ "should be passed either a URI or a cursor to duplicate, "
+ "but not both");
+
+ if (to_dup != NULL) {
+ uri = to_dup->uri;
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "lsm:") &&
+ !WT_PREFIX_MATCH(uri, WT_METADATA_URI) &&
+ !WT_PREFIX_MATCH(uri, "table:") &&
+ __wt_schema_get_source(session, uri) == NULL)
+ WT_ERR(__wt_bad_object_type(session, uri));
+ }
+
+ WT_ERR(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
+ if (to_dup != NULL)
+ WT_ERR(__wt_cursor_dup_position(to_dup, cursor));
+
+ *cursorp = cursor;
+
+ if (0) {
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ }
+
+ /*
+	 * Opening a cursor on a non-existent data source will set ret to
+	 * either ENOENT or WT_NOTFOUND at this point.  However,
+ * applications may reasonably do this inside a transaction to check
+ * for the existence of a table or index.
+ *
+ * Prefer WT_NOTFOUND here: that does not force running transactions to
+ * roll back. It will be mapped back to ENOENT.
+ */
+ if (ret == ENOENT)
+ ret = WT_NOTFOUND;
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __wt_session_create_strip --
+ * Discard any configuration information from a schema entry that is not
+ *	applicable to a session.create call; used by the wt dump command utility,
+ * which only wants to dump the schema information needed for load.
+ */
+int
+__wt_session_create_strip(WT_SESSION *wt_session,
+ const char *v1, const char *v2, const char **value_ret)
+{
+ WT_SESSION_IMPL *session = (WT_SESSION_IMPL *)wt_session;
+ const char *cfg[] =
+ { WT_CONFIG_BASE(session, session_create), v1, v2, NULL };
+
+ return (__wt_config_collapse(session, cfg, value_ret));
+}
+
+/*
+ * __session_create --
+ * WT_SESSION->create method.
+ */
+static int
+__session_create(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, create, config, cfg);
+ WT_UNUSED(cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ /*
+ * Type configuration only applies to tables, column groups and indexes.
+ * We don't want applications to attempt to layer LSM on top of their
+ * extended data-sources, and the fact we allow LSM as a valid URI is an
+ * invitation to that mistake: nip it in the bud.
+ */
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:")) {
+ /*
+		 * We can't disallow type entirely: a configuration string
+		 * might innocently include it, for example, in a dump/load
+		 * pair.  If the URI type prefix and the type are the same,
+		 * let it go.
+ */
+ if ((ret =
+ __wt_config_getones(session, config, "type", &cval)) == 0 &&
+ (strncmp(uri, cval.str, cval.len) != 0 ||
+ uri[cval.len] != ':'))
+ WT_ERR_MSG(session, EINVAL,
+ "%s: unsupported type configuration", uri);
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_create(session, uri, config));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_log_printf --
+ * WT_SESSION->log_printf method.
+ */
+static int
+__session_log_printf(WT_SESSION *wt_session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ WT_SESSION_IMPL *session;
+ WT_DECL_RET;
+ va_list ap;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_NOCONF(session, log_printf);
+
+ va_start(ap, fmt);
+ ret = __wt_log_vprintf(session, fmt, ap);
+ va_end(ap);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_rename --
+ * WT_SESSION->rename method.
+ */
+static int
+__session_rename(WT_SESSION *wt_session,
+ const char *uri, const char *newuri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, rename, config, cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+ WT_ERR(__wt_str_name_check(session, newuri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_rename(session, uri, newuri, cfg));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_compact --
+ * WT_SESSION->compact method.
+ */
+static int
+__session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_RET(__wt_str_name_check(session, uri));
+
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "lsm:") &&
+ !WT_PREFIX_MATCH(uri, "table:"))
+ return (__wt_bad_object_type(session, uri));
+
+ return (__wt_session_compact(wt_session, uri, config));
+}
+
+/*
+ * __session_drop --
+ * WT_SESSION->drop method.
+ */
+static int
+__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, drop, config, cfg);
+
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_drop(session, uri, cfg));
+
+err: /* Note: drop operations cannot be unrolled (yet?). */
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_salvage --
+ * WT_SESSION->salvage method.
+ */
+static int
+__session_salvage(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, salvage, config, cfg);
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_salvage,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_truncate --
+ * WT_SESSION->truncate method.
+ */
+static int
+__session_truncate(WT_SESSION *wt_session,
+ const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_CURSOR *cursor;
+ int cmp;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_TXN_API_CALL(session, truncate, config, cfg);
+
+ /*
+ * If the URI is specified, we don't need start/stop cursors; if
+ * start/stop cursors are specified, we don't need a URI.
+ *
+ * If no URI is specified, and both cursors are specified, start/stop
+ * must reference the same object.
+ *
+ * Any specified cursor must have been initialized.
+ */
+ if ((uri == NULL && start == NULL && stop == NULL) ||
+ (uri != NULL && (start != NULL || stop != NULL)))
+ WT_ERR_MSG(session, EINVAL,
+ "the truncate method should be passed either a URI or "
+ "start/stop cursors, but not both");
+
+ if (uri != NULL) {
+ /* Disallow objects in the WiredTiger name space. */
+ WT_ERR(__wt_str_name_check(session, uri));
+
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_truncate(session, uri, cfg));
+ goto done;
+ }
+
+ /*
+ * Cursor truncate is only supported for some objects, check for the
+ * supporting methods we need, range_truncate and compare.
+ */
+ cursor = start == NULL ? stop : start;
+ if (cursor->compare == NULL)
+ WT_ERR(__wt_bad_object_type(session, cursor->uri));
+
+ /*
+ * If both cursors are set, check they're correctly ordered with
+ * respect to each other. We have to test this before any search,
+ * since a search can change the initial cursor position.
+ *
+ * Rather happily, the compare routine will also confirm the cursors
+ * reference the same object and the keys are set.
+ */
+ if (start != NULL && stop != NULL) {
+ WT_ERR(start->compare(start, stop, &cmp));
+ if (cmp > 0)
+ WT_ERR_MSG(session, EINVAL,
+ "the start cursor position is after the stop "
+ "cursor position");
+ }
+
+ /*
+ * Truncate does not require keys actually exist so that applications
+ * can discard parts of the object's name space without knowing exactly
+ * what records currently appear in the object. For this reason, do a
+ * search-near, rather than a search. Additionally, we have to correct
+ * after calling search-near, to position the start/stop cursors on the
+ * next record greater than/less than the original key. If the cursors
+ * hit the beginning/end of the object, or the start/stop keys cross,
+ * we're done, the range must be empty.
+ */
+ if (start != NULL) {
+ WT_ERR(start->search_near(start, &cmp));
+ if (cmp < 0 && (ret = start->next(start)) != 0) {
+ WT_ERR_NOTFOUND_OK(ret);
+ goto done;
+ }
+ }
+ if (stop != NULL) {
+ WT_ERR(stop->search_near(stop, &cmp));
+ if (cmp > 0 && (ret = stop->prev(stop)) != 0) {
+ WT_ERR_NOTFOUND_OK(ret);
+ goto done;
+ }
+
+ if (start != NULL) {
+ WT_ERR(start->compare(start, stop, &cmp));
+ if (cmp > 0)
+ goto done;
+ }
+ }
+
+ WT_ERR(__wt_schema_range_truncate(session, start, stop));
+
+done:
+err: TXN_API_END_RETRY(session, ret, 0);
+ return ((ret) == WT_NOTFOUND ? ENOENT : (ret));
+}
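
To make the cursor-positioning rules above concrete, here is a hedged
application-level sketch of a range truncate. The table URI, string key
format, and function name are assumptions; cursor cleanup on the error paths
is omitted for brevity.

    #include <wiredtiger.h>

    /*
     * Sketch only: truncate the key range [low, high]. Neither key need
     * exist; truncate positions the cursors with search-near internally.
     */
    static int
    example_truncate_range(
        WT_SESSION *session, const char *low, const char *high)
    {
        WT_CURSOR *start, *stop;
        int ret;

        if ((ret = session->open_cursor(
            session, "table:access", NULL, NULL, &start)) != 0)
            return (ret);
        start->set_key(start, low);
        if ((ret = session->open_cursor(
            session, "table:access", NULL, NULL, &stop)) != 0)
            return (ret);
        stop->set_key(stop, high);

        /* No URI: the range is defined entirely by the two cursors. */
        return (session->truncate(session, NULL, start, stop, NULL));
    }
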
+
+/*
+ * __session_upgrade --
+ * WT_SESSION->upgrade method.
+ */
+static int
+__session_upgrade(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, upgrade, config, cfg);
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_upgrade,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_UPGRADE));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_verify --
+ * WT_SESSION->verify method.
+ */
+static int
+__session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ SESSION_API_CALL(session, verify, config, cfg);
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, uri, __wt_verify,
+ NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_VERIFY));
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
+}
+
+/*
+ * __session_begin_transaction --
+ * WT_SESSION->begin_transaction method.
+ */
+static int
+__session_begin_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, begin_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_begin);
+
+ if (F_ISSET(&session->txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL, "Transaction already running");
+
+ /*
+ * There is no transaction active in this thread; check if the cache is
+ * full, if we have to block for eviction, this is the best time to do
+ * it.
+ */
+ WT_ERR(__wt_cache_full_check(session));
+
+ ret = __wt_txn_begin(session, cfg);
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_commit_transaction --
+ * WT_SESSION->commit_transaction method.
+ */
+static int
+__session_commit_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, commit_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_commit);
+
+ txn = &session->txn;
+ if (F_ISSET(txn, TXN_ERROR)) {
+ __wt_errx(session, "failed transaction requires rollback");
+ ret = EINVAL;
+ }
+
+ if (ret == 0)
+ ret = __wt_txn_commit(session, cfg);
+ else {
+ WT_TRET(__wt_session_reset_cursors(session));
+ WT_TRET(__wt_txn_rollback(session, cfg));
+ }
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __session_rollback_transaction --
+ * WT_SESSION->rollback_transaction method.
+ */
+static int
+__session_rollback_transaction(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, rollback_transaction, config, cfg);
+ WT_STAT_FAST_CONN_INCR(session, txn_rollback);
+
+ WT_TRET(__wt_session_reset_cursors(session));
+
+ WT_TRET(__wt_txn_rollback(session, cfg));
+
+err: API_END_RET(session, ret);
+}
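
The commit/rollback pairing is easiest to see from the application side. A
minimal sketch follows, assuming an already-open session and a cursor on a
table with string key/value formats; it shows why a failed operation must be
rolled back rather than committed (committing a failed transaction returns
EINVAL, per the check above).

    #include <wiredtiger.h>

    /* Sketch only: update a record in a snapshot-isolation transaction. */
    static int
    example_update(WT_SESSION *session, WT_CURSOR *cursor)
    {
        int ret;

        if ((ret = session->begin_transaction(
            session, "isolation=snapshot")) != 0)
            return (ret);

        cursor->set_key(cursor, "key");
        cursor->set_value(cursor, "value");
        if ((ret = cursor->update(cursor)) != 0) {
            /* A failed operation requires rollback, not commit. */
            (void)session->rollback_transaction(session, NULL);
            return (ret);
        }
        return (session->commit_transaction(session, NULL));
    }
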
+
+/*
+ * __session_transaction_pinned_range --
+ * WT_SESSION->transaction_pinned_range method.
+ */
+static int
+__session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_TXN_STATE *txn_state;
+ uint64_t pinned;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL_NOCONF(session, pinned_range);
+
+ txn_state = WT_SESSION_TXN_STATE(session);
+
+ /* Assign pinned to the lesser of id and snap_min. */
+ if (txn_state->id != WT_TXN_NONE &&
+ TXNID_LT(txn_state->id, txn_state->snap_min))
+ pinned = txn_state->id;
+ else
+ pinned = txn_state->snap_min;
+
+ if (pinned == WT_TXN_NONE)
+ *prange = 0;
+ else
+ *prange = S2C(session)->txn_global.current - pinned;
+
+err: API_END_RET(session, ret);
+}
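
A quick sketch of how an application might sample this diagnostic; the
function name and reporting threshold are illustrative assumptions.

    #include <inttypes.h>
    #include <stdio.h>
    #include <wiredtiger.h>

    /* Sketch only: report how many transaction IDs this session pins. */
    static int
    example_report_pinned(WT_SESSION *session)
    {
        uint64_t range;
        int ret;

        if ((ret = session->transaction_pinned_range(session, &range)) != 0)
            return (ret);
        if (range > 1000)
            printf("pinning %" PRIu64 " transaction IDs\n", range);
        return (0);
    }
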
+
+/*
+ * __session_checkpoint --
+ * WT_SESSION->checkpoint method.
+ */
+static int
+__session_checkpoint(WT_SESSION *wt_session, const char *config)
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ txn = &session->txn;
+
+ WT_STAT_FAST_CONN_INCR(session, txn_checkpoint);
+ SESSION_API_CALL(session, checkpoint, config, cfg);
+
+ /*
+ * Checkpoints require a snapshot to write a transactionally consistent
+ * snapshot of the data.
+ *
+ * We can't use an application's transaction: if it has uncommitted
+ * changes, they will be written in the checkpoint and may appear after
+ * a crash.
+ *
+ * Use a real snapshot transaction: we don't want any chance of the
+ * snapshot being updated during the checkpoint. Eviction is prevented
+ * from evicting anything newer than this because we track the oldest
+ * transaction ID in the system that is not visible to all readers.
+ */
+ if (F_ISSET(txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL,
+ "Checkpoint not permitted in a transaction");
+
+ /*
+ * Reset open cursors. Do this explicitly, even though it will happen
+ * implicitly in the call to begin_transaction for the checkpoint: the
+ * checkpoint code will acquire the schema lock before we do that, and
+ * some implementations of WT_CURSOR::reset might need the schema lock.
+ */
+ WT_ERR(__wt_session_reset_cursors(session));
+
+ /*
+ * Don't hijack the session checkpoint thread for eviction.
+ *
+ * Application threads are not generally available for potentially slow
+ * operations, but checkpoint does enough I/O that it may be called upon
+ * to perform slow operations for the block manager.
+ */
+ F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+
+ /*
+ * Only one checkpoint can be active at a time, and checkpoints must run
+ * in the same order as they update the metadata. It's probably a bad
+ * idea to run checkpoints out of multiple threads, but serialize them
+ * here to ensure we don't get into trouble.
+ */
+ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 1);
+ __wt_spin_lock(session, &S2C(session)->checkpoint_lock);
+
+ ret = __wt_txn_checkpoint(session, cfg);
+
+ WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
+ __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
+
+err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
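
The constraints above (no active transaction, one checkpoint at a time) apply
to the public call. A hedged sketch of common configurations follows; the
table URI and checkpoint name are illustrative.

    #include <wiredtiger.h>

    /* Sketch only: three common checkpoint configurations. */
    static int
    example_checkpoints(WT_SESSION *session)
    {
        int ret;

        /* Full database checkpoint. */
        if ((ret = session->checkpoint(session, NULL)) != 0)
            return (ret);
        /* Checkpoint a single object. */
        if ((ret = session->checkpoint(
            session, "target=(\"table:access\")")) != 0)
            return (ret);
        /* Named checkpoint; note named checkpoints can block compaction. */
        return (session->checkpoint(session, "name=midday"));
    }
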
+
+/*
+ * __wt_open_internal_session --
+ * Allocate a session for WiredTiger's use.
+ */
+int
+__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name,
+ int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp)
+{
+ WT_SESSION_IMPL *session;
+
+ *sessionp = NULL;
+
+ WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+ session->name = name;
+
+ /*
+ * Public sessions are automatically closed during WT_CONNECTION->close.
+ * If the session handles for internal threads were to go on the public
+ * list, there would be complex ordering issues during close. Set a
+ * flag to avoid this: internal sessions are not closed automatically.
+ */
+ F_SET(session, WT_SESSION_INTERNAL);
+
+ /*
+ * Some internal threads must keep running after we close all data
+ * handles. Make sure these threads don't open their own handles.
+ */
+ if (!uses_dhandles)
+ F_SET(session, WT_SESSION_NO_DATA_HANDLES);
+
+ /*
+ * Acquiring the metadata handle requires the schema lock; we've seen
+ * problems in the past where a worker thread has acquired the schema
+ * lock unexpectedly, relatively late in the run, and deadlocked. Be
+ * defensive: get it now. The metadata file may not exist when the
+ * connection first creates its default session or the shared cache
+ * pool creates its sessions, so let our caller decline this work.
+ */
+ if (open_metadata) {
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+ WT_RET(__wt_metadata_open(session));
+ }
+
+ *sessionp = session;
+ return (0);
+}
+
+/*
+ * __wt_open_session --
+ * Allocate a session handle. The internal parameter is used for sessions
+ * opened by WiredTiger for its own use.
+ */
+int
+__wt_open_session(WT_CONNECTION_IMPL *conn,
+ WT_EVENT_HANDLER *event_handler, const char *config,
+ WT_SESSION_IMPL **sessionp)
+{
+ static const WT_SESSION stds = {
+ NULL,
+ __session_close,
+ __session_reconfigure,
+ __session_open_cursor,
+ __session_create,
+ __session_compact,
+ __session_drop,
+ __session_log_printf,
+ __session_rename,
+ __session_salvage,
+ __session_truncate,
+ __session_upgrade,
+ __session_verify,
+ __session_begin_transaction,
+ __session_commit_transaction,
+ __session_rollback_transaction,
+ __session_checkpoint,
+ __session_transaction_pinned_range
+ };
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session, *session_ret;
+ uint32_t i;
+
+ *sessionp = NULL;
+
+ session = conn->default_session;
+ session_ret = NULL;
+
+ __wt_spin_lock(session, &conn->api_lock);
+
+ /*
+ * Make sure we don't try to open a new session after the application
+ * closes the connection. This is particularly intended to catch
+ * cases where server threads open sessions.
+ */
+ WT_ASSERT(session, F_ISSET(conn, WT_CONN_SERVER_RUN));
+
+ /* Find the first inactive session slot. */
+ for (session_ret = conn->sessions,
+ i = 0; i < conn->session_size; ++session_ret, ++i)
+ if (!session_ret->active)
+ break;
+ if (i == conn->session_size)
+ WT_ERR_MSG(session, ENOMEM,
+ "only configured to support %" PRIu32 " sessions"
+ " (including %" PRIu32 " internal)",
+ conn->session_size, WT_NUM_INTERNAL_SESSIONS);
+
+ /*
+ * If the active session count is increasing, update it. We don't worry
+ * about correcting the session count on error: as long as we don't mark
+ * this session as active, we'll clean it up on close.
+ */
+ if (i >= conn->session_cnt) /* Defend against off-by-one errors. */
+ conn->session_cnt = i + 1;
+
+ session_ret->id = i;
+ session_ret->iface = stds;
+ session_ret->iface.connection = &conn->iface;
+
+ WT_ERR(__wt_cond_alloc(session, "session", 0, &session_ret->cond));
+
+ __wt_random_init(session_ret->rnd);
+
+ __wt_event_handler_set(session_ret,
+ event_handler == NULL ? session->event_handler : event_handler);
+
+ TAILQ_INIT(&session_ret->cursors);
+ SLIST_INIT(&session_ret->dhandles);
+
+ /* Initialize transaction support: default to read-committed. */
+ session_ret->isolation = TXN_ISO_READ_COMMITTED;
+ WT_ERR(__wt_txn_init(session_ret));
+
+ /*
+ * The session's hazard pointer memory isn't discarded during normal
+ * session close because access to it isn't serialized. Allocate the
+ * first time we open this session.
+ */
+ if (session_ret->hazard == NULL)
+ WT_ERR(__wt_calloc_def(
+ session, conn->hazard_max, &session_ret->hazard));
+
+ /*
+ * Set an initial size for the hazard array. It will be grown as
+ * required up to hazard_max. The hazard_size is reset on close, since
+ * __wt_hazard_close ensures the array is cleared - so it is safe to
+ * reset the starting size on each open.
+ */
+ session_ret->hazard_size = WT_HAZARD_INCR;
+
+ /*
+ * Configuration: currently, the configuration for open_session is the
+ * same as session.reconfigure, so use that function.
+ */
+ if (config != NULL)
+ WT_ERR(
+ __session_reconfigure((WT_SESSION *)session_ret, config));
+
+ session_ret->name = NULL;
+
+ /*
+ * Publish: make the entry visible to server threads. There must be a
+ * barrier for two reasons, to ensure structure fields are set before
+ * any other thread will consider the session, and to push the session
+ * count to ensure the eviction thread can't review too few slots.
+ */
+ WT_PUBLISH(session_ret->active, 1);
+
+ WT_STATIC_ASSERT(offsetof(WT_SESSION_IMPL, iface) == 0);
+ *sessionp = session_ret;
+
+ WT_STAT_FAST_CONN_INCR(session, session_open);
+
+err: __wt_spin_unlock(session, &conn->api_lock);
+ return (ret);
+}
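
From the application side, this function sits behind
WT_CONNECTION->open_session. A minimal usage sketch, with the database path
purely illustrative and error handling abbreviated:

    #include <wiredtiger.h>

    /* Sketch only: open a connection and a session, then close both. */
    static int
    example_open(void)
    {
        WT_CONNECTION *conn;
        WT_SESSION *session;
        int ret;

        if ((ret = wiredtiger_open(
            "/tmp/example", NULL, "create", &conn)) != 0)
            return (ret);
        if ((ret = conn->open_session(
            conn, NULL, "isolation=read-committed", &session)) != 0) {
            (void)conn->close(conn, NULL);
            return (ret);
        }
        /* ... do work ... */
        (void)session->close(session, NULL);
        return (conn->close(conn, NULL));
    }
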
diff --git a/src/third_party/wiredtiger/src/session/session_compact.c b/src/third_party/wiredtiger/src/session/session_compact.c
new file mode 100644
index 00000000000..6eca8a58d13
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_compact.c
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * Compaction is the place where the underlying block manager becomes visible
+ * in the higher engine btree and API layers. As there is currently only one
+ * block manager, this code is written with it in mind: other block managers
+ * may need changes to support compaction, and a smart block manager might need
+ * far less support from the engine.
+ *
+ * First, the default block manager cannot entirely own compaction because it
+ * has no way to find a block after it moves other than a request from the
+ * btree layer with the new address. In other words, if internal page X points
+ * to leaf page Y, and page Y moves, the address of page Y has to be updated in
+ * page X. Generally, this is solved by building a translation layer in the
+ * block manager so internal pages don't require updates to relocate blocks:
+ * however, the translation table must be durable, has its own garbage
+ * collection issues and might be slower, all of which have their own problems.
+ *
+ * Second, the btree layer cannot entirely own compaction because page
+ * addresses are opaque: it cannot know where a page is in the file from the
+ * address cookie.
+ *
+ * For these reasons, compaction is a cooperative process between the btree
+ * layer and the block manager. The btree layer walks files, and asks the
+ * block manager if rewriting a particular block would reduce the file
+ * footprint: if writing the page will help, the page is marked dirty so it
+ * will eventually be written. As pages are written, the original page
+ * potentially becomes available for reuse and if enough pages at the end of
+ * the file are available for reuse, the file can be truncated, and compaction
+ * succeeds.
+ *
+ * However, writing a page is not by itself sufficient to make a page available
+ * for reuse. The original version of the page is still referenced by at least
+ * the most recent checkpoint in the file. To make a page available for reuse,
+ * we have to checkpoint the file so we can discard the checkpoint referencing
+ * the original version of the block; once no checkpoint references a block, it
+ * becomes available for reuse.
+ *
+ * Compaction is not necessarily possible in WiredTiger, even in a file with
+ * lots of available space. If a block at the end of the file is referenced by
+ * a named checkpoint, there is nothing we can do to compact the file, no
+ * matter how many times we rewrite the block, the named checkpoint can't be
+ * discarded and so the reference count on the original block will never go to
+ * zero. What's worse, because the block manager doesn't reference count
+ * blocks, it can't easily know this is the case, and so we'll waste a lot of
+ * effort trying to compact files that can't be compacted.
+ *
+ * Now, to the actual process. First, we checkpoint the high-level object
+ * (which is potentially composed of multiple files): there are potentially
+ * many dirty blocks in the cache, and we want to write them out and then
+ * discard previous checkpoints so we have as many blocks as possible on the
+ * file's "available for reuse" list when we start compaction.
+ *
+ * Then, we compact the high-level object.
+ *
+ * Compacting the object is done 10% at a time; that is, we try to move blocks
+ * from the last 10% of the file into the beginning of the file (the 10% is
+ * hard coded in the block manager). The reason for this is because we are
+ * walking the file in logical order, not block offset order, and we can fail
+ * to compact a file if we write the wrong blocks first.
+ *
+ * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000
+ * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the
+ * file. If we were to rewrite blocks from more than the last 10% of the file,
+ * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy
+ * 10 of them without ever rewriting the blocks from the end of the file which
+ * would allow us to compact the file. So, we compact the last 10% of the
+ * file, and if that works, we compact the last 10% of the file again, and so
+ * on. Note the block manager uses a first-fit block selection algorithm
+ * during compaction to maximize block movement.
+ *
+ * After each 10% compaction, we checkpoint two more times (seriously, twice).
+ * The second and third checkpoints are because the block manager checkpoints
+ * in two steps: blocks made available for reuse during a checkpoint are put on
+ * a special checkpoint-available list and only moved to the real available
+ * list after the metadata has been updated with the new checkpoint's
+ * information. (Otherwise it is possible to allocate a rewritten block, crash
+ * before the metadata is updated, and see corruption.) For this reason,
+ * blocks allocated to write the checkpoint itself cannot be taken from the
+ * blocks made available by the checkpoint.
+ *
+ * To say it another way, the second checkpoint puts the blocks from the end of
+ * the file that were made available by compaction onto the checkpoint-available
+ * list, but then potentially writes the checkpoint itself at the end of the
+ * file, which would prevent any file truncation. When the metadata is updated
+ * for the second checkpoint, the blocks freed by compaction become available
+ * for the third checkpoint, so the third checkpoint's blocks are written
+ * towards the beginning of the file, and then the file can be truncated.
+ */
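
From the API layer, all of the above is driven by a single call. A hedged
usage sketch follows; the URI is illustrative, and the timeout value is in
seconds, per the configuration parsed below.

    #include <wiredtiger.h>

    /* Sketch only: request compaction, giving up after about two minutes. */
    static int
    example_compact(WT_SESSION *session)
    {
        return (session->compact(session, "table:access", "timeout=120"));
    }
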
+
+/*
+ * __wt_compact_uri_analyze --
+ * Extract information relevant to deciding what work compact needs to
+ * do from a URI that is part of a table schema.
+ * Called via the schema_worker function.
+ */
+int
+__wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip)
+{
+ /*
+ * Add references to schema URI objects to the list of objects to be
+ * compacted. Skip over LSM trees or we will get false positives on
+ * the "file:" URIs for the chunks.
+ */
+ if (WT_PREFIX_MATCH(uri, "lsm:")) {
+ session->compact->lsm_count++;
+ *skip = 1;
+ } else if (WT_PREFIX_MATCH(uri, "file:"))
+ session->compact->file_count++;
+
+ return (0);
+}
+
+/*
+ * __session_compact_check_timeout --
+ * Check if the timeout has been exceeded.
+ */
+static int
+__session_compact_check_timeout(
+ WT_SESSION_IMPL *session, struct timespec begin)
+{
+ struct timespec end;
+
+ if (session->compact->max_time == 0)
+ return (0);
+
+ WT_RET(__wt_epoch(session, &end));
+ if (session->compact->max_time <
+ WT_TIMEDIFF(end, begin) / WT_BILLION)
+ WT_RET(ETIMEDOUT);
+ return (0);
+}
+
+/*
+ * __compact_file --
+ * Function to alternate between checkpoints and compaction calls.
+ */
+static int
+__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_DECL_ITEM(t);
+ WT_SESSION *wt_session;
+ WT_TXN *txn;
+ int i;
+ struct timespec start_time;
+
+ txn = &session->txn;
+ wt_session = &session->iface;
+
+ /*
+ * File compaction requires checkpoints, which will fail in a
+ * transactional context. Check now so the error message isn't
+ * confusing.
+ */
+ if (session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING))
+ WT_ERR_MSG(session, EINVAL,
+ " File compaction not permitted in a transaction");
+
+ /*
+ * Force the checkpoint: we don't want to skip it because the work we
+ * need to have done is done in the underlying block manager.
+ */
+ WT_ERR(__wt_scr_alloc(session, 128, &t));
+ WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));
+
+ WT_ERR(__wt_epoch(session, &start_time));
+
+ /*
+ * We compact 10% of the file on each pass, try 10 times (which is
+ * probably overkill), and quit if we make no progress. Check for a
+ * timeout each time through the loop.
+ */
+ for (i = 0; i < 10; ++i) {
+ WT_ERR(wt_session->checkpoint(wt_session, t->data));
+
+ session->compaction = 0;
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(
+ session, uri, __wt_compact, NULL, cfg, 0));
+ WT_ERR(ret);
+ if (!session->compaction)
+ break;
+
+ WT_ERR(wt_session->checkpoint(wt_session, t->data));
+ WT_ERR(wt_session->checkpoint(wt_session, t->data));
+ WT_ERR(__session_compact_check_timeout(session, start_time));
+ }
+
+err: __wt_scr_free(&t);
+ return (ret);
+}
+
+/*
+ * __wt_session_compact --
+ *	Worker function for the WT_SESSION->compact method.
+ */
+int
+__wt_session_compact(
+ WT_SESSION *wt_session, const char *uri, const char *config)
+{
+ WT_COMPACT compact;
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ SESSION_API_CALL(session, compact, config, cfg);
+
+ /* Setup the structure in the session handle */
+ memset(&compact, 0, sizeof(WT_COMPACT));
+ session->compact = &compact;
+
+ WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
+ session->compact->max_time = (uint64_t)cval.val;
+
+ /* Find out what types of data sources are being compacted. */
+ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(
+ session, uri, NULL, __wt_compact_uri_analyze, cfg, 0));
+ WT_ERR(ret);
+
+ if (session->compact->lsm_count != 0)
+ WT_ERR(__wt_schema_worker(
+ session, uri, NULL, __wt_lsm_compact, cfg, 0));
+ if (session->compact->file_count != 0)
+ WT_ERR(__compact_file(session, uri, cfg));
+
+err: session->compact = NULL;
+ API_END_RET_NOTFOUND_MAP(session, ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_dhandle.c b/src/third_party/wiredtiger/src/session/session_dhandle.c
new file mode 100644
index 00000000000..0c07e5fa259
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_dhandle.c
@@ -0,0 +1,478 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_session_dhandle_incr_use --
+ * Increment the session data source's in-use counter.
+ */
+void
+__wt_session_dhandle_incr_use(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+
+ dhandle = session->dhandle;
+
+ (void)WT_ATOMIC_ADD4(dhandle->session_inuse, 1);
+}
+
+/*
+ * __session_dhandle_decr_use --
+ * Decrement the session data source's in-use counter.
+ */
+static int
+__session_dhandle_decr_use(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+
+ /*
+ * Decrement the in-use count on the underlying data-source -- if we're
+ * the last reference, set the time-of-death timestamp.
+ */
+ WT_ASSERT(session, dhandle->session_inuse > 0);
+ if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0)
+ WT_TRET(__wt_seconds(session, &dhandle->timeofdeath));
+ return (0);
+}
+
+/*
+ * __session_add_btree --
+ * Add a handle to the session's cache.
+ */
+static int
+__session_add_btree(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE **dhandle_cachep)
+{
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+
+ WT_RET(__wt_calloc_def(session, 1, &dhandle_cache));
+ dhandle_cache->dhandle = session->dhandle;
+
+ SLIST_INSERT_HEAD(&session->dhandles, dhandle_cache, l);
+
+ if (dhandle_cachep != NULL)
+ *dhandle_cachep = dhandle_cache;
+
+ return (0);
+}
+
+/*
+ * __wt_session_lock_btree --
+ * Lock a btree handle.
+ */
+int
+__wt_session_lock_btree(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ enum { NOLOCK, READLOCK, WRITELOCK } locked;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ uint32_t special_flags;
+
+ btree = S2BT(session);
+ dhandle = session->dhandle;
+ locked = NOLOCK;
+
+ /*
+ * Special operation flags will cause the handle to be reopened.
+ * For example, a handle opened with WT_BTREE_BULK cannot use the same
+ * internal data structures as a handle opened for ordinary access.
+ */
+ special_flags = LF_ISSET(WT_BTREE_SPECIAL_FLAGS);
+ WT_ASSERT(session,
+ special_flags == 0 || LF_ISSET(WT_DHANDLE_EXCLUSIVE));
+
+ if (LF_ISSET(WT_DHANDLE_EXCLUSIVE)) {
+ /*
+ * Try to get an exclusive handle lock and fail immediately if
+ * it's unavailable. We don't expect exclusive operations on
+ * trees to be mixed with ordinary cursor access, but if there
+ * is a use case in the future, we could make blocking here
+ * configurable.
+ *
+ * Special flags will cause the handle to be reopened, which
+ * will get the necessary lock, so don't bother here.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) || special_flags == 0) {
+ WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+ F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
+ locked = WRITELOCK;
+ }
+ } else if (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
+ return (EBUSY);
+ else {
+ WT_RET(__wt_readlock(session, dhandle->rwlock));
+ locked = READLOCK;
+ }
+
+ /*
+ * At this point, we have the requested lock -- if that is all that was
+ * required, we're done. Otherwise, check that the handle is open and
+ * that no special flags are required.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+ (F_ISSET(dhandle, WT_DHANDLE_OPEN) && special_flags == 0))
+ return (0);
+
+ /*
+ * The handle needs to be opened. If we locked the handle above,
+ * unlock it before returning.
+ */
+ switch (locked) {
+ case NOLOCK:
+ break;
+ case READLOCK:
+ WT_RET(__wt_readunlock(session, dhandle->rwlock));
+ break;
+ case WRITELOCK:
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+ WT_RET(__wt_writeunlock(session, dhandle->rwlock));
+ break;
+ }
+
+ /* Treat an unopened handle just like a non-existent handle. */
+ return (WT_NOTFOUND);
+}
+
+/*
+ * __wt_session_release_btree --
+ * Unlock a btree handle.
+ */
+int
+__wt_session_release_btree(WT_SESSION_IMPL *session)
+{
+ enum { NOLOCK, READLOCK, WRITELOCK } locked;
+ WT_BTREE *btree;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ btree = S2BT(session);
+ dhandle = session->dhandle;
+
+ /*
+ * Decrement the data-source's in-use counter. We ignore errors because
+ * they're insignificant and handling them complicates error handling in
+ * this function more than I'm willing to live with.
+ */
+ (void)__session_dhandle_decr_use(session);
+
+ locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ? WRITELOCK : READLOCK;
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_CLOSE)) {
+ /*
+ * If configured to discard on last close, trade any read lock
+ * for an exclusive lock. If the exchange succeeds, set up for
+ * discard. It is expected that acquiring an exclusive lock will
+ * fail sometimes, since the handle may still be in use: in that
+ * case we're done.
+ */
+ if (locked == READLOCK) {
+ locked = NOLOCK;
+ WT_ERR(__wt_readunlock(session, dhandle->rwlock));
+ ret = __wt_try_writelock(session, dhandle->rwlock);
+ if (ret != 0) {
+ if (ret == EBUSY)
+ ret = 0;
+ goto err;
+ }
+ locked = WRITELOCK;
+ F_CLR(dhandle, WT_DHANDLE_DISCARD_CLOSE);
+ F_SET(dhandle,
+ WT_DHANDLE_DISCARD | WT_DHANDLE_EXCLUSIVE);
+ }
+ }
+
+ /*
+ * If we had special flags set, close the handle so that future access
+ * can get a handle without special flags.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) ||
+ F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+ WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));
+ F_CLR(dhandle, WT_DHANDLE_DISCARD);
+
+ WT_TRET(__wt_conn_btree_sync_and_close(session, 0));
+ }
+
+ if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE))
+ F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
+
+err: switch (locked) {
+ case NOLOCK:
+ break;
+ case READLOCK:
+ WT_TRET(__wt_readunlock(session, dhandle->rwlock));
+ break;
+ case WRITELOCK:
+ WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ break;
+ }
+
+ session->dhandle = NULL;
+ return (ret);
+}
+
+/*
+ * __wt_session_get_btree_ckpt --
+ * Check the configuration strings for a checkpoint name, get a btree
+ * handle for the given name, set session->dhandle.
+ */
+int
+__wt_session_get_btree_ckpt(WT_SESSION_IMPL *session,
+ const char *uri, const char *cfg[], uint32_t flags)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ int last_ckpt;
+ const char *checkpoint;
+
+ last_ckpt = 0;
+ checkpoint = NULL;
+
+ /*
+ * This function exists to handle checkpoint configuration. Callers
+ * that never open a checkpoint call the underlying function directly.
+ */
+ WT_RET_NOTFOUND_OK(
+ __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
+ if (cval.len != 0) {
+ /*
+ * The internal checkpoint name is special, find the last
+ * unnamed checkpoint of the object.
+ */
+ if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
+ last_ckpt = 1;
+retry: WT_RET(__wt_meta_checkpoint_last_name(
+ session, uri, &checkpoint));
+ } else
+ WT_RET(__wt_strndup(
+ session, cval.str, cval.len, &checkpoint));
+ }
+
+ ret = __wt_session_get_btree(session, uri, checkpoint, cfg, flags);
+
+ __wt_free(session, checkpoint);
+
+ /*
+ * There's a potential race: we get the name of the most recent unnamed
+ * checkpoint, but if it's discarded (or locked so it can be discarded)
+ * by the time we try to open it, we'll fail the open. Retry in those
+ * cases: a new "last" checkpoint should surface, and we can't return an
+ * error because the application would be justifiably upset if we
+ * couldn't open the last checkpoint instance of an object.
+ *
+ * The check against WT_NOTFOUND is correct: if there was no checkpoint
+ * for the object (that is, the object has never been in a checkpoint),
+ * we returned immediately after the call to search for that name.
+ */
+ if (last_ckpt && (ret == WT_NOTFOUND || ret == EBUSY))
+ goto retry;
+ return (ret);
+}
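
For context, the "checkpoint" configuration handled here arrives through the
public cursor-open call; a hedged sketch with an illustrative table URI:

    #include <wiredtiger.h>

    /* Sketch only: read from the most recent internal checkpoint. */
    static int
    example_checkpoint_cursor(WT_SESSION *session, WT_CURSOR **cursorp)
    {
        return (session->open_cursor(session, "table:access", NULL,
            "checkpoint=WiredTigerCheckpoint", cursorp));
    }
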
+
+/*
+ * __session_discard_btree --
+ * Discard our reference to the btree.
+ */
+static void
+__session_discard_btree(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
+{
+ WT_DATA_HANDLE *saved_dhandle;
+
+ SLIST_REMOVE(
+ &session->dhandles, dhandle_cache, __wt_data_handle_cache, l);
+
+ saved_dhandle = session->dhandle;
+ session->dhandle = dhandle_cache->dhandle;
+
+ __wt_overwrite_and_free(session, dhandle_cache);
+ __wt_conn_btree_close(session);
+
+ /* Restore the original handle in the session. */
+ session->dhandle = saved_dhandle;
+}
+
+/*
+ * __wt_session_close_cache --
+ * Close any cached handles in a session.
+ */
+void
+__wt_session_close_cache(WT_SESSION_IMPL *session)
+{
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+
+ while ((dhandle_cache = SLIST_FIRST(&session->dhandles)) != NULL)
+ __session_discard_btree(session, dhandle_cache);
+}
+
+/*
+ * __session_dhandle_sweep --
+ * Discard any session dhandles that are not open.
+ */
+static int
+__session_dhandle_sweep(WT_SESSION_IMPL *session, uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache, *dhandle_cache_next;
+ time_t now;
+
+ /*
+ * Check the local flag WT_DHANDLE_LOCK_ONLY: a common caller with that
+ * flag is in the path to discard the handle, so don't sweep in that case.
+ */
+ if (LF_ISSET(WT_DHANDLE_LOCK_ONLY))
+ return (0);
+
+ /*
+ * Periodically sweep for dead handles; if we've swept recently, don't
+ * do it again.
+ */
+ WT_RET(__wt_seconds(session, &now));
+ if (now - session->last_sweep < WT_DHANDLE_SWEEP_PERIOD)
+ return (0);
+ session->last_sweep = now;
+
+ WT_STAT_FAST_CONN_INCR(session, dh_session_sweeps);
+
+ dhandle_cache = SLIST_FIRST(&session->dhandles);
+ while (dhandle_cache != NULL) {
+ dhandle_cache_next = SLIST_NEXT(dhandle_cache, l);
+ dhandle = dhandle_cache->dhandle;
+ if (dhandle != session->dhandle &&
+ dhandle->session_inuse == 0 &&
+ now - dhandle->timeofdeath > WT_DHANDLE_SWEEP_WAIT) {
+ WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
+ __session_discard_btree(session, dhandle_cache);
+ }
+ dhandle_cache = dhandle_cache_next;
+ }
+ return (0);
+}
+
+/*
+ * __wt_session_get_btree --
+ * Get a btree handle for the given name, set session->dhandle.
+ */
+int
+__wt_session_get_btree(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+ WT_DECL_RET;
+ uint64_t hash;
+ int candidate;
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
+
+ dhandle = NULL;
+ candidate = 0;
+
+ hash = __wt_hash_city64(uri, strlen(uri));
+ SLIST_FOREACH(dhandle_cache, &session->dhandles, l) {
+ dhandle = dhandle_cache->dhandle;
+ if (hash != dhandle->name_hash ||
+ strcmp(uri, dhandle->name) != 0)
+ continue;
+ if (checkpoint == NULL && dhandle->checkpoint == NULL)
+ break;
+ if (checkpoint != NULL && dhandle->checkpoint != NULL &&
+ strcmp(checkpoint, dhandle->checkpoint) == 0)
+ break;
+ }
+
+ if (dhandle_cache != NULL) {
+ candidate = 1;
+ /* We found the data handle, don't try to get it again. */
+ LF_SET(WT_DHANDLE_HAVE_REF);
+ session->dhandle = dhandle;
+
+ /*
+ * Try to lock the file; if we succeed, our "exclusive" state
+ * must match.
+ */
+ ret = __wt_session_lock_btree(session, flags);
+ if (ret == WT_NOTFOUND)
+ dhandle_cache = NULL;
+ else
+ WT_RET(ret);
+ }
+
+ if (dhandle_cache == NULL) {
+ /* Sweep the handle list to remove any dead handles. */
+ WT_RET(__session_dhandle_sweep(session, flags));
+
+ /*
+ * Acquire the schema lock if we don't already hold it, find
+ * and/or open the handle.
+ */
+ WT_WITH_SCHEMA_LOCK(session, ret =
+ __wt_conn_btree_get(session, uri, checkpoint, cfg, flags));
+ WT_RET(ret);
+
+ if (!candidate)
+ WT_RET(__session_add_btree(session, NULL));
+ WT_ASSERT(session, LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
+ F_ISSET(session->dhandle, WT_DHANDLE_OPEN));
+ }
+
+ /* Increment the data-source's in-use counter. */
+ __wt_session_dhandle_incr_use(session);
+
+ WT_ASSERT(session, LF_ISSET(WT_DHANDLE_EXCLUSIVE) ==
+ F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE));
+ F_SET(session->dhandle, LF_ISSET(WT_DHANDLE_DISCARD_CLOSE));
+
+ return (0);
+}
+
+/*
+ * __wt_session_lock_checkpoint --
+ * Lock the btree handle for the given checkpoint name.
+ */
+int
+__wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
+{
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
+ WT_DECL_RET;
+
+ saved_dhandle = session->dhandle;
+
+ /*
+ * Get the checkpoint handle exclusive, so no one else can access it
+ * while we are creating the new checkpoint.
+ */
+ WT_ERR(__wt_session_get_btree(session, saved_dhandle->name,
+ checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
+
+ /*
+ * Flush any pages in this checkpoint from the cache (we are about to
+ * re-write the checkpoint which will mean cached pages no longer have
+ * valid contents). This is especially noticeable with memory-mapped
+ * files, since changes to the underlying file are visible to the
+ * in-memory pages.
+ */
+ WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+
+ /*
+ * We lock checkpoint handles that we are overwriting, so the handle
+ * must be closed when we release it.
+ */
+ dhandle = session->dhandle;
+ F_SET(dhandle, WT_DHANDLE_DISCARD);
+
+ WT_ASSERT(session, WT_META_TRACKING(session));
+ WT_ERR(__wt_meta_track_handle_lock(session, 0));
+
+ /* Restore the original btree in the session. */
+err: session->dhandle = saved_dhandle;
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/session/session_salvage.c b/src/third_party/wiredtiger/src/session/session_salvage.c
new file mode 100644
index 00000000000..1512c6515ec
--- /dev/null
+++ b/src/third_party/wiredtiger/src/session/session_salvage.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_salvage --
+ * Salvage a single file.
+ */
+int
+__wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CKPT *ckptbase;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+
+ dhandle = session->dhandle;
+
+ /*
+ * XXX
+ * The salvage process reads and discards previous checkpoints, so the
+ * underlying block manager has to ignore any previous checkpoint
+ * entries when creating a new checkpoint. In other words, we can't use
+ * the metadata checkpoint list: it has all of those checkpoints listed
+ * and we don't care about them. Build a clean checkpoint list and use
+ * it instead.
+ *
+ * Don't first clear the metadata checkpoint list and call the function
+ * to get a list of checkpoints: a crash between clearing the metadata
+ * checkpoint list and creating a new checkpoint list would look like a
+ * create or open of a file without a checkpoint to roll-forward from,
+ * and the contents of the file would be discarded.
+ */
+ WT_RET(__wt_calloc_def(session, 2, &ckptbase));
+ WT_ERR(__wt_strdup(session, WT_CHECKPOINT, &ckptbase[0].name));
+ F_SET(&ckptbase[0], WT_CKPT_ADD);
+
+ WT_ERR(__wt_bt_salvage(session, ckptbase, cfg));
+
+ /*
+ * If no checkpoint was created, well, it's probably bad news, but there
+ * is nothing to do but clear any recorded checkpoints for the file. If
+ * a checkpoint was created, life is good: replace any existing list of
+ * checkpoints with the single new one.
+ */
+ if (ckptbase[0].raw.data == NULL)
+ WT_ERR(__wt_meta_checkpoint_clear(session, dhandle->name));
+ else
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, NULL));
+
+err: __wt_meta_ckptlist_free(session, ckptbase);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/support/cksum.c b/src/third_party/wiredtiger/src/support/cksum.c
new file mode 100644
index 00000000000..1eaa345d1fe
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/cksum.c
@@ -0,0 +1,1306 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * This file contains two implementations for computing CRC: one that uses
+ * hardware CRC instructions, available on newer x86_64/amd64, and one that uses
+ * a fast software algorithm. __wt_cksum() provides a common entry point that
+ * indirects to one of these two methods.
+ */
+static uint32_t (*__wt_cksum_func)(const void *chunk, size_t len);
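
The declaration above supports a classic function-pointer dispatch: select the
implementation once at startup, then have __wt_cksum() jump through the
pointer. The initialization sketch below is an assumption for illustration
only, using GCC's __builtin_cpu_supports rather than whatever detection this
file actually performs; __wt_cksum_hw and __wt_cksum_sw stand for the two
implementations.

    /*
     * Hypothetical initialization sketch, not this file's actual detection
     * code: select the hardware CRC32 implementation when SSE4.2 is
     * available, otherwise fall back to the software tables below.
     */
    static uint32_t __wt_cksum_hw(const void *chunk, size_t len);
    static uint32_t __wt_cksum_sw(const void *chunk, size_t len);

    static void
    __cksum_init_example(void)
    {
    #if defined(__GNUC__) && defined(__x86_64__)
        __wt_cksum_func = __builtin_cpu_supports("sse4.2") ?
            __wt_cksum_hw : __wt_cksum_sw;
    #else
        __wt_cksum_func = __wt_cksum_sw;    /* Portable fallback. */
    #endif
    }
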
+
+/*
+ * The CRC slicing tables are used by __wt_cksum_sw.
+ */
+static const uint32_t g_crc_slicing[8][256] = {
+#ifdef WORDS_BIGENDIAN
+ /*
+ * Big endian tables have entries that are byte reversed from little
+ * endian tables.
+ */
+ {
+ 0x00000000, 0x03836bf2, 0xf7703be1, 0xf4f35013,
+ 0x1f979ac7, 0x1c14f135, 0xe8e7a126, 0xeb64cad4,
+ 0xcf58d98a, 0xccdbb278, 0x3828e26b, 0x3bab8999,
+ 0xd0cf434d, 0xd34c28bf, 0x27bf78ac, 0x243c135e,
+ 0x6fc75e10, 0x6c4435e2, 0x98b765f1, 0x9b340e03,
+ 0x7050c4d7, 0x73d3af25, 0x8720ff36, 0x84a394c4,
+ 0xa09f879a, 0xa31cec68, 0x57efbc7b, 0x546cd789,
+ 0xbf081d5d, 0xbc8b76af, 0x487826bc, 0x4bfb4d4e,
+ 0xde8ebd20, 0xdd0dd6d2, 0x29fe86c1, 0x2a7ded33,
+ 0xc11927e7, 0xc29a4c15, 0x36691c06, 0x35ea77f4,
+ 0x11d664aa, 0x12550f58, 0xe6a65f4b, 0xe52534b9,
+ 0x0e41fe6d, 0x0dc2959f, 0xf931c58c, 0xfab2ae7e,
+ 0xb149e330, 0xb2ca88c2, 0x4639d8d1, 0x45bab323,
+ 0xaede79f7, 0xad5d1205, 0x59ae4216, 0x5a2d29e4,
+ 0x7e113aba, 0x7d925148, 0x8961015b, 0x8ae26aa9,
+ 0x6186a07d, 0x6205cb8f, 0x96f69b9c, 0x9575f06e,
+ 0xbc1d7b41, 0xbf9e10b3, 0x4b6d40a0, 0x48ee2b52,
+ 0xa38ae186, 0xa0098a74, 0x54fada67, 0x5779b195,
+ 0x7345a2cb, 0x70c6c939, 0x8435992a, 0x87b6f2d8,
+ 0x6cd2380c, 0x6f5153fe, 0x9ba203ed, 0x9821681f,
+ 0xd3da2551, 0xd0594ea3, 0x24aa1eb0, 0x27297542,
+ 0xcc4dbf96, 0xcfced464, 0x3b3d8477, 0x38beef85,
+ 0x1c82fcdb, 0x1f019729, 0xebf2c73a, 0xe871acc8,
+ 0x0315661c, 0x00960dee, 0xf4655dfd, 0xf7e6360f,
+ 0x6293c661, 0x6110ad93, 0x95e3fd80, 0x96609672,
+ 0x7d045ca6, 0x7e873754, 0x8a746747, 0x89f70cb5,
+ 0xadcb1feb, 0xae487419, 0x5abb240a, 0x59384ff8,
+ 0xb25c852c, 0xb1dfeede, 0x452cbecd, 0x46afd53f,
+ 0x0d549871, 0x0ed7f383, 0xfa24a390, 0xf9a7c862,
+ 0x12c302b6, 0x11406944, 0xe5b33957, 0xe63052a5,
+ 0xc20c41fb, 0xc18f2a09, 0x357c7a1a, 0x36ff11e8,
+ 0xdd9bdb3c, 0xde18b0ce, 0x2aebe0dd, 0x29688b2f,
+ 0x783bf682, 0x7bb89d70, 0x8f4bcd63, 0x8cc8a691,
+ 0x67ac6c45, 0x642f07b7, 0x90dc57a4, 0x935f3c56,
+ 0xb7632f08, 0xb4e044fa, 0x401314e9, 0x43907f1b,
+ 0xa8f4b5cf, 0xab77de3d, 0x5f848e2e, 0x5c07e5dc,
+ 0x17fca892, 0x147fc360, 0xe08c9373, 0xe30ff881,
+ 0x086b3255, 0x0be859a7, 0xff1b09b4, 0xfc986246,
+ 0xd8a47118, 0xdb271aea, 0x2fd44af9, 0x2c57210b,
+ 0xc733ebdf, 0xc4b0802d, 0x3043d03e, 0x33c0bbcc,
+ 0xa6b54ba2, 0xa5362050, 0x51c57043, 0x52461bb1,
+ 0xb922d165, 0xbaa1ba97, 0x4e52ea84, 0x4dd18176,
+ 0x69ed9228, 0x6a6ef9da, 0x9e9da9c9, 0x9d1ec23b,
+ 0x767a08ef, 0x75f9631d, 0x810a330e, 0x828958fc,
+ 0xc97215b2, 0xcaf17e40, 0x3e022e53, 0x3d8145a1,
+ 0xd6e58f75, 0xd566e487, 0x2195b494, 0x2216df66,
+ 0x062acc38, 0x05a9a7ca, 0xf15af7d9, 0xf2d99c2b,
+ 0x19bd56ff, 0x1a3e3d0d, 0xeecd6d1e, 0xed4e06ec,
+ 0xc4268dc3, 0xc7a5e631, 0x3356b622, 0x30d5ddd0,
+ 0xdbb11704, 0xd8327cf6, 0x2cc12ce5, 0x2f424717,
+ 0x0b7e5449, 0x08fd3fbb, 0xfc0e6fa8, 0xff8d045a,
+ 0x14e9ce8e, 0x176aa57c, 0xe399f56f, 0xe01a9e9d,
+ 0xabe1d3d3, 0xa862b821, 0x5c91e832, 0x5f1283c0,
+ 0xb4764914, 0xb7f522e6, 0x430672f5, 0x40851907,
+ 0x64b90a59, 0x673a61ab, 0x93c931b8, 0x904a5a4a,
+ 0x7b2e909e, 0x78adfb6c, 0x8c5eab7f, 0x8fddc08d,
+ 0x1aa830e3, 0x192b5b11, 0xedd80b02, 0xee5b60f0,
+ 0x053faa24, 0x06bcc1d6, 0xf24f91c5, 0xf1ccfa37,
+ 0xd5f0e969, 0xd673829b, 0x2280d288, 0x2103b97a,
+ 0xca6773ae, 0xc9e4185c, 0x3d17484f, 0x3e9423bd,
+ 0x756f6ef3, 0x76ec0501, 0x821f5512, 0x819c3ee0,
+ 0x6af8f434, 0x697b9fc6, 0x9d88cfd5, 0x9e0ba427,
+ 0xba37b779, 0xb9b4dc8b, 0x4d478c98, 0x4ec4e76a,
+ 0xa5a02dbe, 0xa623464c, 0x52d0165f, 0x51537dad
+ },{
+ 0x00000000, 0x7798a213, 0xee304527, 0x99a8e734,
+ 0xdc618a4e, 0xabf9285d, 0x3251cf69, 0x45c96d7a,
+ 0xb8c3149d, 0xcf5bb68e, 0x56f351ba, 0x216bf3a9,
+ 0x64a29ed3, 0x133a3cc0, 0x8a92dbf4, 0xfd0a79e7,
+ 0x81f1c53f, 0xf669672c, 0x6fc18018, 0x1859220b,
+ 0x5d904f71, 0x2a08ed62, 0xb3a00a56, 0xc438a845,
+ 0x3932d1a2, 0x4eaa73b1, 0xd7029485, 0xa09a3696,
+ 0xe5535bec, 0x92cbf9ff, 0x0b631ecb, 0x7cfbbcd8,
+ 0x02e38b7f, 0x757b296c, 0xecd3ce58, 0x9b4b6c4b,
+ 0xde820131, 0xa91aa322, 0x30b24416, 0x472ae605,
+ 0xba209fe2, 0xcdb83df1, 0x5410dac5, 0x238878d6,
+ 0x664115ac, 0x11d9b7bf, 0x8871508b, 0xffe9f298,
+ 0x83124e40, 0xf48aec53, 0x6d220b67, 0x1abaa974,
+ 0x5f73c40e, 0x28eb661d, 0xb1438129, 0xc6db233a,
+ 0x3bd15add, 0x4c49f8ce, 0xd5e11ffa, 0xa279bde9,
+ 0xe7b0d093, 0x90287280, 0x098095b4, 0x7e1837a7,
+ 0x04c617ff, 0x735eb5ec, 0xeaf652d8, 0x9d6ef0cb,
+ 0xd8a79db1, 0xaf3f3fa2, 0x3697d896, 0x410f7a85,
+ 0xbc050362, 0xcb9da171, 0x52354645, 0x25ade456,
+ 0x6064892c, 0x17fc2b3f, 0x8e54cc0b, 0xf9cc6e18,
+ 0x8537d2c0, 0xf2af70d3, 0x6b0797e7, 0x1c9f35f4,
+ 0x5956588e, 0x2ecefa9d, 0xb7661da9, 0xc0febfba,
+ 0x3df4c65d, 0x4a6c644e, 0xd3c4837a, 0xa45c2169,
+ 0xe1954c13, 0x960dee00, 0x0fa50934, 0x783dab27,
+ 0x06259c80, 0x71bd3e93, 0xe815d9a7, 0x9f8d7bb4,
+ 0xda4416ce, 0xaddcb4dd, 0x347453e9, 0x43ecf1fa,
+ 0xbee6881d, 0xc97e2a0e, 0x50d6cd3a, 0x274e6f29,
+ 0x62870253, 0x151fa040, 0x8cb74774, 0xfb2fe567,
+ 0x87d459bf, 0xf04cfbac, 0x69e41c98, 0x1e7cbe8b,
+ 0x5bb5d3f1, 0x2c2d71e2, 0xb58596d6, 0xc21d34c5,
+ 0x3f174d22, 0x488fef31, 0xd1270805, 0xa6bfaa16,
+ 0xe376c76c, 0x94ee657f, 0x0d46824b, 0x7ade2058,
+ 0xf9fac3fb, 0x8e6261e8, 0x17ca86dc, 0x605224cf,
+ 0x259b49b5, 0x5203eba6, 0xcbab0c92, 0xbc33ae81,
+ 0x4139d766, 0x36a17575, 0xaf099241, 0xd8913052,
+ 0x9d585d28, 0xeac0ff3b, 0x7368180f, 0x04f0ba1c,
+ 0x780b06c4, 0x0f93a4d7, 0x963b43e3, 0xe1a3e1f0,
+ 0xa46a8c8a, 0xd3f22e99, 0x4a5ac9ad, 0x3dc26bbe,
+ 0xc0c81259, 0xb750b04a, 0x2ef8577e, 0x5960f56d,
+ 0x1ca99817, 0x6b313a04, 0xf299dd30, 0x85017f23,
+ 0xfb194884, 0x8c81ea97, 0x15290da3, 0x62b1afb0,
+ 0x2778c2ca, 0x50e060d9, 0xc94887ed, 0xbed025fe,
+ 0x43da5c19, 0x3442fe0a, 0xadea193e, 0xda72bb2d,
+ 0x9fbbd657, 0xe8237444, 0x718b9370, 0x06133163,
+ 0x7ae88dbb, 0x0d702fa8, 0x94d8c89c, 0xe3406a8f,
+ 0xa68907f5, 0xd111a5e6, 0x48b942d2, 0x3f21e0c1,
+ 0xc22b9926, 0xb5b33b35, 0x2c1bdc01, 0x5b837e12,
+ 0x1e4a1368, 0x69d2b17b, 0xf07a564f, 0x87e2f45c,
+ 0xfd3cd404, 0x8aa47617, 0x130c9123, 0x64943330,
+ 0x215d5e4a, 0x56c5fc59, 0xcf6d1b6d, 0xb8f5b97e,
+ 0x45ffc099, 0x3267628a, 0xabcf85be, 0xdc5727ad,
+ 0x999e4ad7, 0xee06e8c4, 0x77ae0ff0, 0x0036ade3,
+ 0x7ccd113b, 0x0b55b328, 0x92fd541c, 0xe565f60f,
+ 0xa0ac9b75, 0xd7343966, 0x4e9cde52, 0x39047c41,
+ 0xc40e05a6, 0xb396a7b5, 0x2a3e4081, 0x5da6e292,
+ 0x186f8fe8, 0x6ff72dfb, 0xf65fcacf, 0x81c768dc,
+ 0xffdf5f7b, 0x8847fd68, 0x11ef1a5c, 0x6677b84f,
+ 0x23bed535, 0x54267726, 0xcd8e9012, 0xba163201,
+ 0x471c4be6, 0x3084e9f5, 0xa92c0ec1, 0xdeb4acd2,
+ 0x9b7dc1a8, 0xece563bb, 0x754d848f, 0x02d5269c,
+ 0x7e2e9a44, 0x09b63857, 0x901edf63, 0xe7867d70,
+ 0xa24f100a, 0xd5d7b219, 0x4c7f552d, 0x3be7f73e,
+ 0xc6ed8ed9, 0xb1752cca, 0x28ddcbfe, 0x5f4569ed,
+ 0x1a8c0497, 0x6d14a684, 0xf4bc41b0, 0x8324e3a3
+ },{
+ 0x00000000, 0x7e9241a5, 0x0d526f4f, 0x73c02eea,
+ 0x1aa4de9e, 0x64369f3b, 0x17f6b1d1, 0x6964f074,
+ 0xc53e5138, 0xbbac109d, 0xc86c3e77, 0xb6fe7fd2,
+ 0xdf9a8fa6, 0xa108ce03, 0xd2c8e0e9, 0xac5aa14c,
+ 0x8a7da270, 0xf4efe3d5, 0x872fcd3f, 0xf9bd8c9a,
+ 0x90d97cee, 0xee4b3d4b, 0x9d8b13a1, 0xe3195204,
+ 0x4f43f348, 0x31d1b2ed, 0x42119c07, 0x3c83dda2,
+ 0x55e72dd6, 0x2b756c73, 0x58b54299, 0x2627033c,
+ 0x14fb44e1, 0x6a690544, 0x19a92bae, 0x673b6a0b,
+ 0x0e5f9a7f, 0x70cddbda, 0x030df530, 0x7d9fb495,
+ 0xd1c515d9, 0xaf57547c, 0xdc977a96, 0xa2053b33,
+ 0xcb61cb47, 0xb5f38ae2, 0xc633a408, 0xb8a1e5ad,
+ 0x9e86e691, 0xe014a734, 0x93d489de, 0xed46c87b,
+ 0x8422380f, 0xfab079aa, 0x89705740, 0xf7e216e5,
+ 0x5bb8b7a9, 0x252af60c, 0x56ead8e6, 0x28789943,
+ 0x411c6937, 0x3f8e2892, 0x4c4e0678, 0x32dc47dd,
+ 0xd98065c7, 0xa7122462, 0xd4d20a88, 0xaa404b2d,
+ 0xc324bb59, 0xbdb6fafc, 0xce76d416, 0xb0e495b3,
+ 0x1cbe34ff, 0x622c755a, 0x11ec5bb0, 0x6f7e1a15,
+ 0x061aea61, 0x7888abc4, 0x0b48852e, 0x75dac48b,
+ 0x53fdc7b7, 0x2d6f8612, 0x5eafa8f8, 0x203de95d,
+ 0x49591929, 0x37cb588c, 0x440b7666, 0x3a9937c3,
+ 0x96c3968f, 0xe851d72a, 0x9b91f9c0, 0xe503b865,
+ 0x8c674811, 0xf2f509b4, 0x8135275e, 0xffa766fb,
+ 0xcd7b2126, 0xb3e96083, 0xc0294e69, 0xbebb0fcc,
+ 0xd7dfffb8, 0xa94dbe1d, 0xda8d90f7, 0xa41fd152,
+ 0x0845701e, 0x76d731bb, 0x05171f51, 0x7b855ef4,
+ 0x12e1ae80, 0x6c73ef25, 0x1fb3c1cf, 0x6121806a,
+ 0x47068356, 0x3994c2f3, 0x4a54ec19, 0x34c6adbc,
+ 0x5da25dc8, 0x23301c6d, 0x50f03287, 0x2e627322,
+ 0x8238d26e, 0xfcaa93cb, 0x8f6abd21, 0xf1f8fc84,
+ 0x989c0cf0, 0xe60e4d55, 0x95ce63bf, 0xeb5c221a,
+ 0x4377278b, 0x3de5662e, 0x4e2548c4, 0x30b70961,
+ 0x59d3f915, 0x2741b8b0, 0x5481965a, 0x2a13d7ff,
+ 0x864976b3, 0xf8db3716, 0x8b1b19fc, 0xf5895859,
+ 0x9ceda82d, 0xe27fe988, 0x91bfc762, 0xef2d86c7,
+ 0xc90a85fb, 0xb798c45e, 0xc458eab4, 0xbacaab11,
+ 0xd3ae5b65, 0xad3c1ac0, 0xdefc342a, 0xa06e758f,
+ 0x0c34d4c3, 0x72a69566, 0x0166bb8c, 0x7ff4fa29,
+ 0x16900a5d, 0x68024bf8, 0x1bc26512, 0x655024b7,
+ 0x578c636a, 0x291e22cf, 0x5ade0c25, 0x244c4d80,
+ 0x4d28bdf4, 0x33bafc51, 0x407ad2bb, 0x3ee8931e,
+ 0x92b23252, 0xec2073f7, 0x9fe05d1d, 0xe1721cb8,
+ 0x8816eccc, 0xf684ad69, 0x85448383, 0xfbd6c226,
+ 0xddf1c11a, 0xa36380bf, 0xd0a3ae55, 0xae31eff0,
+ 0xc7551f84, 0xb9c75e21, 0xca0770cb, 0xb495316e,
+ 0x18cf9022, 0x665dd187, 0x159dff6d, 0x6b0fbec8,
+ 0x026b4ebc, 0x7cf90f19, 0x0f3921f3, 0x71ab6056,
+ 0x9af7424c, 0xe46503e9, 0x97a52d03, 0xe9376ca6,
+ 0x80539cd2, 0xfec1dd77, 0x8d01f39d, 0xf393b238,
+ 0x5fc91374, 0x215b52d1, 0x529b7c3b, 0x2c093d9e,
+ 0x456dcdea, 0x3bff8c4f, 0x483fa2a5, 0x36ade300,
+ 0x108ae03c, 0x6e18a199, 0x1dd88f73, 0x634aced6,
+ 0x0a2e3ea2, 0x74bc7f07, 0x077c51ed, 0x79ee1048,
+ 0xd5b4b104, 0xab26f0a1, 0xd8e6de4b, 0xa6749fee,
+ 0xcf106f9a, 0xb1822e3f, 0xc24200d5, 0xbcd04170,
+ 0x8e0c06ad, 0xf09e4708, 0x835e69e2, 0xfdcc2847,
+ 0x94a8d833, 0xea3a9996, 0x99fab77c, 0xe768f6d9,
+ 0x4b325795, 0x35a01630, 0x466038da, 0x38f2797f,
+ 0x5196890b, 0x2f04c8ae, 0x5cc4e644, 0x2256a7e1,
+ 0x0471a4dd, 0x7ae3e578, 0x0923cb92, 0x77b18a37,
+ 0x1ed57a43, 0x60473be6, 0x1387150c, 0x6d1554a9,
+ 0xc14ff5e5, 0xbfddb440, 0xcc1d9aaa, 0xb28fdb0f,
+ 0xdbeb2b7b, 0xa5796ade, 0xd6b94434, 0xa82b0591
+ },{
+ 0x00000000, 0xb8aa45dd, 0x812367bf, 0x39892262,
+ 0xf331227b, 0x4b9b67a6, 0x721245c4, 0xcab80019,
+ 0xe66344f6, 0x5ec9012b, 0x67402349, 0xdfea6694,
+ 0x1552668d, 0xadf82350, 0x94710132, 0x2cdb44ef,
+ 0x3db164e9, 0x851b2134, 0xbc920356, 0x0438468b,
+ 0xce804692, 0x762a034f, 0x4fa3212d, 0xf70964f0,
+ 0xdbd2201f, 0x637865c2, 0x5af147a0, 0xe25b027d,
+ 0x28e30264, 0x904947b9, 0xa9c065db, 0x116a2006,
+ 0x8b1425d7, 0x33be600a, 0x0a374268, 0xb29d07b5,
+ 0x782507ac, 0xc08f4271, 0xf9066013, 0x41ac25ce,
+ 0x6d776121, 0xd5dd24fc, 0xec54069e, 0x54fe4343,
+ 0x9e46435a, 0x26ec0687, 0x1f6524e5, 0xa7cf6138,
+ 0xb6a5413e, 0x0e0f04e3, 0x37862681, 0x8f2c635c,
+ 0x45946345, 0xfd3e2698, 0xc4b704fa, 0x7c1d4127,
+ 0x50c605c8, 0xe86c4015, 0xd1e56277, 0x694f27aa,
+ 0xa3f727b3, 0x1b5d626e, 0x22d4400c, 0x9a7e05d1,
+ 0xe75fa6ab, 0x5ff5e376, 0x667cc114, 0xded684c9,
+ 0x146e84d0, 0xacc4c10d, 0x954de36f, 0x2de7a6b2,
+ 0x013ce25d, 0xb996a780, 0x801f85e2, 0x38b5c03f,
+ 0xf20dc026, 0x4aa785fb, 0x732ea799, 0xcb84e244,
+ 0xdaeec242, 0x6244879f, 0x5bcda5fd, 0xe367e020,
+ 0x29dfe039, 0x9175a5e4, 0xa8fc8786, 0x1056c25b,
+ 0x3c8d86b4, 0x8427c369, 0xbdaee10b, 0x0504a4d6,
+ 0xcfbca4cf, 0x7716e112, 0x4e9fc370, 0xf63586ad,
+ 0x6c4b837c, 0xd4e1c6a1, 0xed68e4c3, 0x55c2a11e,
+ 0x9f7aa107, 0x27d0e4da, 0x1e59c6b8, 0xa6f38365,
+ 0x8a28c78a, 0x32828257, 0x0b0ba035, 0xb3a1e5e8,
+ 0x7919e5f1, 0xc1b3a02c, 0xf83a824e, 0x4090c793,
+ 0x51fae795, 0xe950a248, 0xd0d9802a, 0x6873c5f7,
+ 0xa2cbc5ee, 0x1a618033, 0x23e8a251, 0x9b42e78c,
+ 0xb799a363, 0x0f33e6be, 0x36bac4dc, 0x8e108101,
+ 0x44a88118, 0xfc02c4c5, 0xc58be6a7, 0x7d21a37a,
+ 0x3fc9a052, 0x8763e58f, 0xbeeac7ed, 0x06408230,
+ 0xccf88229, 0x7452c7f4, 0x4ddbe596, 0xf571a04b,
+ 0xd9aae4a4, 0x6100a179, 0x5889831b, 0xe023c6c6,
+ 0x2a9bc6df, 0x92318302, 0xabb8a160, 0x1312e4bd,
+ 0x0278c4bb, 0xbad28166, 0x835ba304, 0x3bf1e6d9,
+ 0xf149e6c0, 0x49e3a31d, 0x706a817f, 0xc8c0c4a2,
+ 0xe41b804d, 0x5cb1c590, 0x6538e7f2, 0xdd92a22f,
+ 0x172aa236, 0xaf80e7eb, 0x9609c589, 0x2ea38054,
+ 0xb4dd8585, 0x0c77c058, 0x35fee23a, 0x8d54a7e7,
+ 0x47eca7fe, 0xff46e223, 0xc6cfc041, 0x7e65859c,
+ 0x52bec173, 0xea1484ae, 0xd39da6cc, 0x6b37e311,
+ 0xa18fe308, 0x1925a6d5, 0x20ac84b7, 0x9806c16a,
+ 0x896ce16c, 0x31c6a4b1, 0x084f86d3, 0xb0e5c30e,
+ 0x7a5dc317, 0xc2f786ca, 0xfb7ea4a8, 0x43d4e175,
+ 0x6f0fa59a, 0xd7a5e047, 0xee2cc225, 0x568687f8,
+ 0x9c3e87e1, 0x2494c23c, 0x1d1de05e, 0xa5b7a583,
+ 0xd89606f9, 0x603c4324, 0x59b56146, 0xe11f249b,
+ 0x2ba72482, 0x930d615f, 0xaa84433d, 0x122e06e0,
+ 0x3ef5420f, 0x865f07d2, 0xbfd625b0, 0x077c606d,
+ 0xcdc46074, 0x756e25a9, 0x4ce707cb, 0xf44d4216,
+ 0xe5276210, 0x5d8d27cd, 0x640405af, 0xdcae4072,
+ 0x1616406b, 0xaebc05b6, 0x973527d4, 0x2f9f6209,
+ 0x034426e6, 0xbbee633b, 0x82674159, 0x3acd0484,
+ 0xf075049d, 0x48df4140, 0x71566322, 0xc9fc26ff,
+ 0x5382232e, 0xeb2866f3, 0xd2a14491, 0x6a0b014c,
+ 0xa0b30155, 0x18194488, 0x219066ea, 0x993a2337,
+ 0xb5e167d8, 0x0d4b2205, 0x34c20067, 0x8c6845ba,
+ 0x46d045a3, 0xfe7a007e, 0xc7f3221c, 0x7f5967c1,
+ 0x6e3347c7, 0xd699021a, 0xef102078, 0x57ba65a5,
+ 0x9d0265bc, 0x25a82061, 0x1c210203, 0xa48b47de,
+ 0x88500331, 0x30fa46ec, 0x0973648e, 0xb1d92153,
+ 0x7b61214a, 0xc3cb6497, 0xfa4246f5, 0x42e80328
+ },{
+ 0x00000000, 0xac6f1138, 0x58df2270, 0xf4b03348,
+ 0xb0be45e0, 0x1cd154d8, 0xe8616790, 0x440e76a8,
+ 0x910b67c5, 0x3d6476fd, 0xc9d445b5, 0x65bb548d,
+ 0x21b52225, 0x8dda331d, 0x796a0055, 0xd505116d,
+ 0xd361228f, 0x7f0e33b7, 0x8bbe00ff, 0x27d111c7,
+ 0x63df676f, 0xcfb07657, 0x3b00451f, 0x976f5427,
+ 0x426a454a, 0xee055472, 0x1ab5673a, 0xb6da7602,
+ 0xf2d400aa, 0x5ebb1192, 0xaa0b22da, 0x066433e2,
+ 0x57b5a81b, 0xfbdab923, 0x0f6a8a6b, 0xa3059b53,
+ 0xe70bedfb, 0x4b64fcc3, 0xbfd4cf8b, 0x13bbdeb3,
+ 0xc6becfde, 0x6ad1dee6, 0x9e61edae, 0x320efc96,
+ 0x76008a3e, 0xda6f9b06, 0x2edfa84e, 0x82b0b976,
+ 0x84d48a94, 0x28bb9bac, 0xdc0ba8e4, 0x7064b9dc,
+ 0x346acf74, 0x9805de4c, 0x6cb5ed04, 0xc0dafc3c,
+ 0x15dfed51, 0xb9b0fc69, 0x4d00cf21, 0xe16fde19,
+ 0xa561a8b1, 0x090eb989, 0xfdbe8ac1, 0x51d19bf9,
+ 0xae6a5137, 0x0205400f, 0xf6b57347, 0x5ada627f,
+ 0x1ed414d7, 0xb2bb05ef, 0x460b36a7, 0xea64279f,
+ 0x3f6136f2, 0x930e27ca, 0x67be1482, 0xcbd105ba,
+ 0x8fdf7312, 0x23b0622a, 0xd7005162, 0x7b6f405a,
+ 0x7d0b73b8, 0xd1646280, 0x25d451c8, 0x89bb40f0,
+ 0xcdb53658, 0x61da2760, 0x956a1428, 0x39050510,
+ 0xec00147d, 0x406f0545, 0xb4df360d, 0x18b02735,
+ 0x5cbe519d, 0xf0d140a5, 0x046173ed, 0xa80e62d5,
+ 0xf9dff92c, 0x55b0e814, 0xa100db5c, 0x0d6fca64,
+ 0x4961bccc, 0xe50eadf4, 0x11be9ebc, 0xbdd18f84,
+ 0x68d49ee9, 0xc4bb8fd1, 0x300bbc99, 0x9c64ada1,
+ 0xd86adb09, 0x7405ca31, 0x80b5f979, 0x2cdae841,
+ 0x2abedba3, 0x86d1ca9b, 0x7261f9d3, 0xde0ee8eb,
+ 0x9a009e43, 0x366f8f7b, 0xc2dfbc33, 0x6eb0ad0b,
+ 0xbbb5bc66, 0x17daad5e, 0xe36a9e16, 0x4f058f2e,
+ 0x0b0bf986, 0xa764e8be, 0x53d4dbf6, 0xffbbcace,
+ 0x5cd5a26e, 0xf0bab356, 0x040a801e, 0xa8659126,
+ 0xec6be78e, 0x4004f6b6, 0xb4b4c5fe, 0x18dbd4c6,
+ 0xcddec5ab, 0x61b1d493, 0x9501e7db, 0x396ef6e3,
+ 0x7d60804b, 0xd10f9173, 0x25bfa23b, 0x89d0b303,
+ 0x8fb480e1, 0x23db91d9, 0xd76ba291, 0x7b04b3a9,
+ 0x3f0ac501, 0x9365d439, 0x67d5e771, 0xcbbaf649,
+ 0x1ebfe724, 0xb2d0f61c, 0x4660c554, 0xea0fd46c,
+ 0xae01a2c4, 0x026eb3fc, 0xf6de80b4, 0x5ab1918c,
+ 0x0b600a75, 0xa70f1b4d, 0x53bf2805, 0xffd0393d,
+ 0xbbde4f95, 0x17b15ead, 0xe3016de5, 0x4f6e7cdd,
+ 0x9a6b6db0, 0x36047c88, 0xc2b44fc0, 0x6edb5ef8,
+ 0x2ad52850, 0x86ba3968, 0x720a0a20, 0xde651b18,
+ 0xd80128fa, 0x746e39c2, 0x80de0a8a, 0x2cb11bb2,
+ 0x68bf6d1a, 0xc4d07c22, 0x30604f6a, 0x9c0f5e52,
+ 0x490a4f3f, 0xe5655e07, 0x11d56d4f, 0xbdba7c77,
+ 0xf9b40adf, 0x55db1be7, 0xa16b28af, 0x0d043997,
+ 0xf2bff359, 0x5ed0e261, 0xaa60d129, 0x060fc011,
+ 0x4201b6b9, 0xee6ea781, 0x1ade94c9, 0xb6b185f1,
+ 0x63b4949c, 0xcfdb85a4, 0x3b6bb6ec, 0x9704a7d4,
+ 0xd30ad17c, 0x7f65c044, 0x8bd5f30c, 0x27bae234,
+ 0x21ded1d6, 0x8db1c0ee, 0x7901f3a6, 0xd56ee29e,
+ 0x91609436, 0x3d0f850e, 0xc9bfb646, 0x65d0a77e,
+ 0xb0d5b613, 0x1cbaa72b, 0xe80a9463, 0x4465855b,
+ 0x006bf3f3, 0xac04e2cb, 0x58b4d183, 0xf4dbc0bb,
+ 0xa50a5b42, 0x09654a7a, 0xfdd57932, 0x51ba680a,
+ 0x15b41ea2, 0xb9db0f9a, 0x4d6b3cd2, 0xe1042dea,
+ 0x34013c87, 0x986e2dbf, 0x6cde1ef7, 0xc0b10fcf,
+ 0x84bf7967, 0x28d0685f, 0xdc605b17, 0x700f4a2f,
+ 0x766b79cd, 0xda0468f5, 0x2eb45bbd, 0x82db4a85,
+ 0xc6d53c2d, 0x6aba2d15, 0x9e0a1e5d, 0x32650f65,
+ 0xe7601e08, 0x4b0f0f30, 0xbfbf3c78, 0x13d02d40,
+ 0x57de5be8, 0xfbb14ad0, 0x0f017998, 0xa36e68a0
+ },{
+ 0x00000000, 0x196b30ef, 0xc3a08cdb, 0xdacbbc34,
+ 0x7737f5b2, 0x6e5cc55d, 0xb4977969, 0xadfc4986,
+ 0x1f180660, 0x0673368f, 0xdcb88abb, 0xc5d3ba54,
+ 0x682ff3d2, 0x7144c33d, 0xab8f7f09, 0xb2e44fe6,
+ 0x3e300cc0, 0x275b3c2f, 0xfd90801b, 0xe4fbb0f4,
+ 0x4907f972, 0x506cc99d, 0x8aa775a9, 0x93cc4546,
+ 0x21280aa0, 0x38433a4f, 0xe288867b, 0xfbe3b694,
+ 0x561fff12, 0x4f74cffd, 0x95bf73c9, 0x8cd44326,
+ 0x8d16f485, 0x947dc46a, 0x4eb6785e, 0x57dd48b1,
+ 0xfa210137, 0xe34a31d8, 0x39818dec, 0x20eabd03,
+ 0x920ef2e5, 0x8b65c20a, 0x51ae7e3e, 0x48c54ed1,
+ 0xe5390757, 0xfc5237b8, 0x26998b8c, 0x3ff2bb63,
+ 0xb326f845, 0xaa4dc8aa, 0x7086749e, 0x69ed4471,
+ 0xc4110df7, 0xdd7a3d18, 0x07b1812c, 0x1edab1c3,
+ 0xac3efe25, 0xb555ceca, 0x6f9e72fe, 0x76f54211,
+ 0xdb090b97, 0xc2623b78, 0x18a9874c, 0x01c2b7a3,
+ 0xeb5b040e, 0xf23034e1, 0x28fb88d5, 0x3190b83a,
+ 0x9c6cf1bc, 0x8507c153, 0x5fcc7d67, 0x46a74d88,
+ 0xf443026e, 0xed283281, 0x37e38eb5, 0x2e88be5a,
+ 0x8374f7dc, 0x9a1fc733, 0x40d47b07, 0x59bf4be8,
+ 0xd56b08ce, 0xcc003821, 0x16cb8415, 0x0fa0b4fa,
+ 0xa25cfd7c, 0xbb37cd93, 0x61fc71a7, 0x78974148,
+ 0xca730eae, 0xd3183e41, 0x09d38275, 0x10b8b29a,
+ 0xbd44fb1c, 0xa42fcbf3, 0x7ee477c7, 0x678f4728,
+ 0x664df08b, 0x7f26c064, 0xa5ed7c50, 0xbc864cbf,
+ 0x117a0539, 0x081135d6, 0xd2da89e2, 0xcbb1b90d,
+ 0x7955f6eb, 0x603ec604, 0xbaf57a30, 0xa39e4adf,
+ 0x0e620359, 0x170933b6, 0xcdc28f82, 0xd4a9bf6d,
+ 0x587dfc4b, 0x4116cca4, 0x9bdd7090, 0x82b6407f,
+ 0x2f4a09f9, 0x36213916, 0xecea8522, 0xf581b5cd,
+ 0x4765fa2b, 0x5e0ecac4, 0x84c576f0, 0x9dae461f,
+ 0x30520f99, 0x29393f76, 0xf3f28342, 0xea99b3ad,
+ 0xd6b7081c, 0xcfdc38f3, 0x151784c7, 0x0c7cb428,
+ 0xa180fdae, 0xb8ebcd41, 0x62207175, 0x7b4b419a,
+ 0xc9af0e7c, 0xd0c43e93, 0x0a0f82a7, 0x1364b248,
+ 0xbe98fbce, 0xa7f3cb21, 0x7d387715, 0x645347fa,
+ 0xe88704dc, 0xf1ec3433, 0x2b278807, 0x324cb8e8,
+ 0x9fb0f16e, 0x86dbc181, 0x5c107db5, 0x457b4d5a,
+ 0xf79f02bc, 0xeef43253, 0x343f8e67, 0x2d54be88,
+ 0x80a8f70e, 0x99c3c7e1, 0x43087bd5, 0x5a634b3a,
+ 0x5ba1fc99, 0x42cacc76, 0x98017042, 0x816a40ad,
+ 0x2c96092b, 0x35fd39c4, 0xef3685f0, 0xf65db51f,
+ 0x44b9faf9, 0x5dd2ca16, 0x87197622, 0x9e7246cd,
+ 0x338e0f4b, 0x2ae53fa4, 0xf02e8390, 0xe945b37f,
+ 0x6591f059, 0x7cfac0b6, 0xa6317c82, 0xbf5a4c6d,
+ 0x12a605eb, 0x0bcd3504, 0xd1068930, 0xc86db9df,
+ 0x7a89f639, 0x63e2c6d6, 0xb9297ae2, 0xa0424a0d,
+ 0x0dbe038b, 0x14d53364, 0xce1e8f50, 0xd775bfbf,
+ 0x3dec0c12, 0x24873cfd, 0xfe4c80c9, 0xe727b026,
+ 0x4adbf9a0, 0x53b0c94f, 0x897b757b, 0x90104594,
+ 0x22f40a72, 0x3b9f3a9d, 0xe15486a9, 0xf83fb646,
+ 0x55c3ffc0, 0x4ca8cf2f, 0x9663731b, 0x8f0843f4,
+ 0x03dc00d2, 0x1ab7303d, 0xc07c8c09, 0xd917bce6,
+ 0x74ebf560, 0x6d80c58f, 0xb74b79bb, 0xae204954,
+ 0x1cc406b2, 0x05af365d, 0xdf648a69, 0xc60fba86,
+ 0x6bf3f300, 0x7298c3ef, 0xa8537fdb, 0xb1384f34,
+ 0xb0faf897, 0xa991c878, 0x735a744c, 0x6a3144a3,
+ 0xc7cd0d25, 0xdea63dca, 0x046d81fe, 0x1d06b111,
+ 0xafe2fef7, 0xb689ce18, 0x6c42722c, 0x752942c3,
+ 0xd8d50b45, 0xc1be3baa, 0x1b75879e, 0x021eb771,
+ 0x8ecaf457, 0x97a1c4b8, 0x4d6a788c, 0x54014863,
+ 0xf9fd01e5, 0xe096310a, 0x3a5d8d3e, 0x2336bdd1,
+ 0x91d2f237, 0x88b9c2d8, 0x52727eec, 0x4b194e03,
+ 0xe6e50785, 0xff8e376a, 0x25458b5e, 0x3c2ebbb1
+ },{
+ 0x00000000, 0xc82c0368, 0x905906d0, 0x587505b8,
+ 0xd1c5e0a5, 0x19e9e3cd, 0x419ce675, 0x89b0e51d,
+ 0x53fd2d4e, 0x9bd12e26, 0xc3a42b9e, 0x0b8828f6,
+ 0x8238cdeb, 0x4a14ce83, 0x1261cb3b, 0xda4dc853,
+ 0xa6fa5b9c, 0x6ed658f4, 0x36a35d4c, 0xfe8f5e24,
+ 0x773fbb39, 0xbf13b851, 0xe766bde9, 0x2f4abe81,
+ 0xf50776d2, 0x3d2b75ba, 0x655e7002, 0xad72736a,
+ 0x24c29677, 0xecee951f, 0xb49b90a7, 0x7cb793cf,
+ 0xbd835b3d, 0x75af5855, 0x2dda5ded, 0xe5f65e85,
+ 0x6c46bb98, 0xa46ab8f0, 0xfc1fbd48, 0x3433be20,
+ 0xee7e7673, 0x2652751b, 0x7e2770a3, 0xb60b73cb,
+ 0x3fbb96d6, 0xf79795be, 0xafe29006, 0x67ce936e,
+ 0x1b7900a1, 0xd35503c9, 0x8b200671, 0x430c0519,
+ 0xcabce004, 0x0290e36c, 0x5ae5e6d4, 0x92c9e5bc,
+ 0x48842def, 0x80a82e87, 0xd8dd2b3f, 0x10f12857,
+ 0x9941cd4a, 0x516dce22, 0x0918cb9a, 0xc134c8f2,
+ 0x7a07b77a, 0xb22bb412, 0xea5eb1aa, 0x2272b2c2,
+ 0xabc257df, 0x63ee54b7, 0x3b9b510f, 0xf3b75267,
+ 0x29fa9a34, 0xe1d6995c, 0xb9a39ce4, 0x718f9f8c,
+ 0xf83f7a91, 0x301379f9, 0x68667c41, 0xa04a7f29,
+ 0xdcfdece6, 0x14d1ef8e, 0x4ca4ea36, 0x8488e95e,
+ 0x0d380c43, 0xc5140f2b, 0x9d610a93, 0x554d09fb,
+ 0x8f00c1a8, 0x472cc2c0, 0x1f59c778, 0xd775c410,
+ 0x5ec5210d, 0x96e92265, 0xce9c27dd, 0x06b024b5,
+ 0xc784ec47, 0x0fa8ef2f, 0x57ddea97, 0x9ff1e9ff,
+ 0x16410ce2, 0xde6d0f8a, 0x86180a32, 0x4e34095a,
+ 0x9479c109, 0x5c55c261, 0x0420c7d9, 0xcc0cc4b1,
+ 0x45bc21ac, 0x8d9022c4, 0xd5e5277c, 0x1dc92414,
+ 0x617eb7db, 0xa952b4b3, 0xf127b10b, 0x390bb263,
+ 0xb0bb577e, 0x78975416, 0x20e251ae, 0xe8ce52c6,
+ 0x32839a95, 0xfaaf99fd, 0xa2da9c45, 0x6af69f2d,
+ 0xe3467a30, 0x2b6a7958, 0x731f7ce0, 0xbb337f88,
+ 0xf40e6ef5, 0x3c226d9d, 0x64576825, 0xac7b6b4d,
+ 0x25cb8e50, 0xede78d38, 0xb5928880, 0x7dbe8be8,
+ 0xa7f343bb, 0x6fdf40d3, 0x37aa456b, 0xff864603,
+ 0x7636a31e, 0xbe1aa076, 0xe66fa5ce, 0x2e43a6a6,
+ 0x52f43569, 0x9ad83601, 0xc2ad33b9, 0x0a8130d1,
+ 0x8331d5cc, 0x4b1dd6a4, 0x1368d31c, 0xdb44d074,
+ 0x01091827, 0xc9251b4f, 0x91501ef7, 0x597c1d9f,
+ 0xd0ccf882, 0x18e0fbea, 0x4095fe52, 0x88b9fd3a,
+ 0x498d35c8, 0x81a136a0, 0xd9d43318, 0x11f83070,
+ 0x9848d56d, 0x5064d605, 0x0811d3bd, 0xc03dd0d5,
+ 0x1a701886, 0xd25c1bee, 0x8a291e56, 0x42051d3e,
+ 0xcbb5f823, 0x0399fb4b, 0x5becfef3, 0x93c0fd9b,
+ 0xef776e54, 0x275b6d3c, 0x7f2e6884, 0xb7026bec,
+ 0x3eb28ef1, 0xf69e8d99, 0xaeeb8821, 0x66c78b49,
+ 0xbc8a431a, 0x74a64072, 0x2cd345ca, 0xe4ff46a2,
+ 0x6d4fa3bf, 0xa563a0d7, 0xfd16a56f, 0x353aa607,
+ 0x8e09d98f, 0x4625dae7, 0x1e50df5f, 0xd67cdc37,
+ 0x5fcc392a, 0x97e03a42, 0xcf953ffa, 0x07b93c92,
+ 0xddf4f4c1, 0x15d8f7a9, 0x4dadf211, 0x8581f179,
+ 0x0c311464, 0xc41d170c, 0x9c6812b4, 0x544411dc,
+ 0x28f38213, 0xe0df817b, 0xb8aa84c3, 0x708687ab,
+ 0xf93662b6, 0x311a61de, 0x696f6466, 0xa143670e,
+ 0x7b0eaf5d, 0xb322ac35, 0xeb57a98d, 0x237baae5,
+ 0xaacb4ff8, 0x62e74c90, 0x3a924928, 0xf2be4a40,
+ 0x338a82b2, 0xfba681da, 0xa3d38462, 0x6bff870a,
+ 0xe24f6217, 0x2a63617f, 0x721664c7, 0xba3a67af,
+ 0x6077affc, 0xa85bac94, 0xf02ea92c, 0x3802aa44,
+ 0xb1b24f59, 0x799e4c31, 0x21eb4989, 0xe9c74ae1,
+ 0x9570d92e, 0x5d5cda46, 0x0529dffe, 0xcd05dc96,
+ 0x44b5398b, 0x8c993ae3, 0xd4ec3f5b, 0x1cc03c33,
+ 0xc68df460, 0x0ea1f708, 0x56d4f2b0, 0x9ef8f1d8,
+ 0x174814c5, 0xdf6417ad, 0x87111215, 0x4f3d117d
+ },{
+ 0x00000000, 0x277d3c49, 0x4efa7892, 0x698744db,
+ 0x6d821d21, 0x4aff2168, 0x237865b3, 0x040559fa,
+ 0xda043b42, 0xfd79070b, 0x94fe43d0, 0xb3837f99,
+ 0xb7862663, 0x90fb1a2a, 0xf97c5ef1, 0xde0162b8,
+ 0xb4097684, 0x93744acd, 0xfaf30e16, 0xdd8e325f,
+ 0xd98b6ba5, 0xfef657ec, 0x97711337, 0xb00c2f7e,
+ 0x6e0d4dc6, 0x4970718f, 0x20f73554, 0x078a091d,
+ 0x038f50e7, 0x24f26cae, 0x4d752875, 0x6a08143c,
+ 0x9965000d, 0xbe183c44, 0xd79f789f, 0xf0e244d6,
+ 0xf4e71d2c, 0xd39a2165, 0xba1d65be, 0x9d6059f7,
+ 0x43613b4f, 0x641c0706, 0x0d9b43dd, 0x2ae67f94,
+ 0x2ee3266e, 0x099e1a27, 0x60195efc, 0x476462b5,
+ 0x2d6c7689, 0x0a114ac0, 0x63960e1b, 0x44eb3252,
+ 0x40ee6ba8, 0x679357e1, 0x0e14133a, 0x29692f73,
+ 0xf7684dcb, 0xd0157182, 0xb9923559, 0x9eef0910,
+ 0x9aea50ea, 0xbd976ca3, 0xd4102878, 0xf36d1431,
+ 0x32cb001a, 0x15b63c53, 0x7c317888, 0x5b4c44c1,
+ 0x5f491d3b, 0x78342172, 0x11b365a9, 0x36ce59e0,
+ 0xe8cf3b58, 0xcfb20711, 0xa63543ca, 0x81487f83,
+ 0x854d2679, 0xa2301a30, 0xcbb75eeb, 0xecca62a2,
+ 0x86c2769e, 0xa1bf4ad7, 0xc8380e0c, 0xef453245,
+ 0xeb406bbf, 0xcc3d57f6, 0xa5ba132d, 0x82c72f64,
+ 0x5cc64ddc, 0x7bbb7195, 0x123c354e, 0x35410907,
+ 0x314450fd, 0x16396cb4, 0x7fbe286f, 0x58c31426,
+ 0xabae0017, 0x8cd33c5e, 0xe5547885, 0xc22944cc,
+ 0xc62c1d36, 0xe151217f, 0x88d665a4, 0xafab59ed,
+ 0x71aa3b55, 0x56d7071c, 0x3f5043c7, 0x182d7f8e,
+ 0x1c282674, 0x3b551a3d, 0x52d25ee6, 0x75af62af,
+ 0x1fa77693, 0x38da4ada, 0x515d0e01, 0x76203248,
+ 0x72256bb2, 0x555857fb, 0x3cdf1320, 0x1ba22f69,
+ 0xc5a34dd1, 0xe2de7198, 0x8b593543, 0xac24090a,
+ 0xa82150f0, 0x8f5c6cb9, 0xe6db2862, 0xc1a6142b,
+ 0x64960134, 0x43eb3d7d, 0x2a6c79a6, 0x0d1145ef,
+ 0x09141c15, 0x2e69205c, 0x47ee6487, 0x609358ce,
+ 0xbe923a76, 0x99ef063f, 0xf06842e4, 0xd7157ead,
+ 0xd3102757, 0xf46d1b1e, 0x9dea5fc5, 0xba97638c,
+ 0xd09f77b0, 0xf7e24bf9, 0x9e650f22, 0xb918336b,
+ 0xbd1d6a91, 0x9a6056d8, 0xf3e71203, 0xd49a2e4a,
+ 0x0a9b4cf2, 0x2de670bb, 0x44613460, 0x631c0829,
+ 0x671951d3, 0x40646d9a, 0x29e32941, 0x0e9e1508,
+ 0xfdf30139, 0xda8e3d70, 0xb30979ab, 0x947445e2,
+ 0x90711c18, 0xb70c2051, 0xde8b648a, 0xf9f658c3,
+ 0x27f73a7b, 0x008a0632, 0x690d42e9, 0x4e707ea0,
+ 0x4a75275a, 0x6d081b13, 0x048f5fc8, 0x23f26381,
+ 0x49fa77bd, 0x6e874bf4, 0x07000f2f, 0x207d3366,
+ 0x24786a9c, 0x030556d5, 0x6a82120e, 0x4dff2e47,
+ 0x93fe4cff, 0xb48370b6, 0xdd04346d, 0xfa790824,
+ 0xfe7c51de, 0xd9016d97, 0xb086294c, 0x97fb1505,
+ 0x565d012e, 0x71203d67, 0x18a779bc, 0x3fda45f5,
+ 0x3bdf1c0f, 0x1ca22046, 0x7525649d, 0x525858d4,
+ 0x8c593a6c, 0xab240625, 0xc2a342fe, 0xe5de7eb7,
+ 0xe1db274d, 0xc6a61b04, 0xaf215fdf, 0x885c6396,
+ 0xe25477aa, 0xc5294be3, 0xacae0f38, 0x8bd33371,
+ 0x8fd66a8b, 0xa8ab56c2, 0xc12c1219, 0xe6512e50,
+ 0x38504ce8, 0x1f2d70a1, 0x76aa347a, 0x51d70833,
+ 0x55d251c9, 0x72af6d80, 0x1b28295b, 0x3c551512,
+ 0xcf380123, 0xe8453d6a, 0x81c279b1, 0xa6bf45f8,
+ 0xa2ba1c02, 0x85c7204b, 0xec406490, 0xcb3d58d9,
+ 0x153c3a61, 0x32410628, 0x5bc642f3, 0x7cbb7eba,
+ 0x78be2740, 0x5fc31b09, 0x36445fd2, 0x1139639b,
+ 0x7b3177a7, 0x5c4c4bee, 0x35cb0f35, 0x12b6337c,
+ 0x16b36a86, 0x31ce56cf, 0x58491214, 0x7f342e5d,
+ 0xa1354ce5, 0x864870ac, 0xefcf3477, 0xc8b2083e,
+ 0xccb751c4, 0xebca6d8d, 0x824d2956, 0xa530151f
+ }
+#else
+ {
+ 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+ 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+ 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+ 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+ 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+ 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+ 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+ 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+ 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+ 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+ 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+ 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+ 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+ 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+ 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+ 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+ 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+ 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+ 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+ 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+ 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+ 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+ 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+ 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+ 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+ 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+ 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+ 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+ 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+ 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+ 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+ 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+ 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+ 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+ 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+ 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+ 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+ 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+ 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+ 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+ 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+ 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+ 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+ 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+ 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+ 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+ 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+ 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+ 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+ 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+ 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+ 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+ 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+ 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+ 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+ 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+ 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+ 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+ 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+ 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+ 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+ 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+ 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+ 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+ },{
+ 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
+ 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+ 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+ 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+ 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
+ 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+ 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
+ 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+ 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+ 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+ 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
+ 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+ 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
+ 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+ 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+ 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+ 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
+ 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+ 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
+ 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+ 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+ 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+ 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
+ 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+ 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
+ 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+ 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+ 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+ 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
+ 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+ 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
+ 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+ 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+ 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+ 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
+ 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+ 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
+ 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+ 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+ 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+ 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
+ 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+ 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
+ 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+ 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+ 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+ 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
+ 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+ 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
+ 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+ 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+ 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+ 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
+ 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+ 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
+ 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+ 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+ 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+ 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
+ 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+ 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
+ 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+ 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+ 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+ },{
+ 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
+ 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+ 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+ 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+ 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
+ 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+ 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
+ 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+ 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+ 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+ 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
+ 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+ 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
+ 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+ 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+ 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+ 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
+ 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+ 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
+ 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+ 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+ 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+ 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
+ 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+ 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
+ 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+ 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+ 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+ 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
+ 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+ 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
+ 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+ 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+ 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+ 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
+ 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+ 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
+ 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+ 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+ 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+ 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
+ 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+ 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
+ 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+ 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+ 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+ 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
+ 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+ 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
+ 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+ 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+ 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+ 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
+ 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+ 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
+ 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+ 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+ 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+ 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
+ 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+ 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
+ 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+ 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+ 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+ },{
+ 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
+ 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+ 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+ 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+ 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
+ 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+ 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
+ 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+ 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+ 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+ 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
+ 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+ 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
+ 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+ 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+ 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+ 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
+ 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+ 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
+ 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+ 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+ 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+ 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
+ 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+ 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
+ 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+ 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+ 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+ 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
+ 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+ 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
+ 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+ 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+ 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+ 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
+ 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+ 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
+ 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+ 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+ 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+ 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
+ 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+ 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
+ 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+ 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+ 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+ 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
+ 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+ 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
+ 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+ 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+ 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+ 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
+ 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+ 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
+ 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+ 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+ 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+ 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
+ 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+ 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
+ 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+ 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+ 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+ },{
+ 0x00000000, 0x38116fac, 0x7022df58, 0x4833b0f4,
+ 0xe045beb0, 0xd854d11c, 0x906761e8, 0xa8760e44,
+ 0xc5670b91, 0xfd76643d, 0xb545d4c9, 0x8d54bb65,
+ 0x2522b521, 0x1d33da8d, 0x55006a79, 0x6d1105d5,
+ 0x8f2261d3, 0xb7330e7f, 0xff00be8b, 0xc711d127,
+ 0x6f67df63, 0x5776b0cf, 0x1f45003b, 0x27546f97,
+ 0x4a456a42, 0x725405ee, 0x3a67b51a, 0x0276dab6,
+ 0xaa00d4f2, 0x9211bb5e, 0xda220baa, 0xe2336406,
+ 0x1ba8b557, 0x23b9dafb, 0x6b8a6a0f, 0x539b05a3,
+ 0xfbed0be7, 0xc3fc644b, 0x8bcfd4bf, 0xb3debb13,
+ 0xdecfbec6, 0xe6ded16a, 0xaeed619e, 0x96fc0e32,
+ 0x3e8a0076, 0x069b6fda, 0x4ea8df2e, 0x76b9b082,
+ 0x948ad484, 0xac9bbb28, 0xe4a80bdc, 0xdcb96470,
+ 0x74cf6a34, 0x4cde0598, 0x04edb56c, 0x3cfcdac0,
+ 0x51eddf15, 0x69fcb0b9, 0x21cf004d, 0x19de6fe1,
+ 0xb1a861a5, 0x89b90e09, 0xc18abefd, 0xf99bd151,
+ 0x37516aae, 0x0f400502, 0x4773b5f6, 0x7f62da5a,
+ 0xd714d41e, 0xef05bbb2, 0xa7360b46, 0x9f2764ea,
+ 0xf236613f, 0xca270e93, 0x8214be67, 0xba05d1cb,
+ 0x1273df8f, 0x2a62b023, 0x625100d7, 0x5a406f7b,
+ 0xb8730b7d, 0x806264d1, 0xc851d425, 0xf040bb89,
+ 0x5836b5cd, 0x6027da61, 0x28146a95, 0x10050539,
+ 0x7d1400ec, 0x45056f40, 0x0d36dfb4, 0x3527b018,
+ 0x9d51be5c, 0xa540d1f0, 0xed736104, 0xd5620ea8,
+ 0x2cf9dff9, 0x14e8b055, 0x5cdb00a1, 0x64ca6f0d,
+ 0xccbc6149, 0xf4ad0ee5, 0xbc9ebe11, 0x848fd1bd,
+ 0xe99ed468, 0xd18fbbc4, 0x99bc0b30, 0xa1ad649c,
+ 0x09db6ad8, 0x31ca0574, 0x79f9b580, 0x41e8da2c,
+ 0xa3dbbe2a, 0x9bcad186, 0xd3f96172, 0xebe80ede,
+ 0x439e009a, 0x7b8f6f36, 0x33bcdfc2, 0x0badb06e,
+ 0x66bcb5bb, 0x5eadda17, 0x169e6ae3, 0x2e8f054f,
+ 0x86f90b0b, 0xbee864a7, 0xf6dbd453, 0xcecabbff,
+ 0x6ea2d55c, 0x56b3baf0, 0x1e800a04, 0x269165a8,
+ 0x8ee76bec, 0xb6f60440, 0xfec5b4b4, 0xc6d4db18,
+ 0xabc5decd, 0x93d4b161, 0xdbe70195, 0xe3f66e39,
+ 0x4b80607d, 0x73910fd1, 0x3ba2bf25, 0x03b3d089,
+ 0xe180b48f, 0xd991db23, 0x91a26bd7, 0xa9b3047b,
+ 0x01c50a3f, 0x39d46593, 0x71e7d567, 0x49f6bacb,
+ 0x24e7bf1e, 0x1cf6d0b2, 0x54c56046, 0x6cd40fea,
+ 0xc4a201ae, 0xfcb36e02, 0xb480def6, 0x8c91b15a,
+ 0x750a600b, 0x4d1b0fa7, 0x0528bf53, 0x3d39d0ff,
+ 0x954fdebb, 0xad5eb117, 0xe56d01e3, 0xdd7c6e4f,
+ 0xb06d6b9a, 0x887c0436, 0xc04fb4c2, 0xf85edb6e,
+ 0x5028d52a, 0x6839ba86, 0x200a0a72, 0x181b65de,
+ 0xfa2801d8, 0xc2396e74, 0x8a0ade80, 0xb21bb12c,
+ 0x1a6dbf68, 0x227cd0c4, 0x6a4f6030, 0x525e0f9c,
+ 0x3f4f0a49, 0x075e65e5, 0x4f6dd511, 0x777cbabd,
+ 0xdf0ab4f9, 0xe71bdb55, 0xaf286ba1, 0x9739040d,
+ 0x59f3bff2, 0x61e2d05e, 0x29d160aa, 0x11c00f06,
+ 0xb9b60142, 0x81a76eee, 0xc994de1a, 0xf185b1b6,
+ 0x9c94b463, 0xa485dbcf, 0xecb66b3b, 0xd4a70497,
+ 0x7cd10ad3, 0x44c0657f, 0x0cf3d58b, 0x34e2ba27,
+ 0xd6d1de21, 0xeec0b18d, 0xa6f30179, 0x9ee26ed5,
+ 0x36946091, 0x0e850f3d, 0x46b6bfc9, 0x7ea7d065,
+ 0x13b6d5b0, 0x2ba7ba1c, 0x63940ae8, 0x5b856544,
+ 0xf3f36b00, 0xcbe204ac, 0x83d1b458, 0xbbc0dbf4,
+ 0x425b0aa5, 0x7a4a6509, 0x3279d5fd, 0x0a68ba51,
+ 0xa21eb415, 0x9a0fdbb9, 0xd23c6b4d, 0xea2d04e1,
+ 0x873c0134, 0xbf2d6e98, 0xf71ede6c, 0xcf0fb1c0,
+ 0x6779bf84, 0x5f68d028, 0x175b60dc, 0x2f4a0f70,
+ 0xcd796b76, 0xf56804da, 0xbd5bb42e, 0x854adb82,
+ 0x2d3cd5c6, 0x152dba6a, 0x5d1e0a9e, 0x650f6532,
+ 0x081e60e7, 0x300f0f4b, 0x783cbfbf, 0x402dd013,
+ 0xe85bde57, 0xd04ab1fb, 0x9879010f, 0xa0686ea3
+ },{
+ 0x00000000, 0xef306b19, 0xdb8ca0c3, 0x34bccbda,
+ 0xb2f53777, 0x5dc55c6e, 0x697997b4, 0x8649fcad,
+ 0x6006181f, 0x8f367306, 0xbb8ab8dc, 0x54bad3c5,
+ 0xd2f32f68, 0x3dc34471, 0x097f8fab, 0xe64fe4b2,
+ 0xc00c303e, 0x2f3c5b27, 0x1b8090fd, 0xf4b0fbe4,
+ 0x72f90749, 0x9dc96c50, 0xa975a78a, 0x4645cc93,
+ 0xa00a2821, 0x4f3a4338, 0x7b8688e2, 0x94b6e3fb,
+ 0x12ff1f56, 0xfdcf744f, 0xc973bf95, 0x2643d48c,
+ 0x85f4168d, 0x6ac47d94, 0x5e78b64e, 0xb148dd57,
+ 0x370121fa, 0xd8314ae3, 0xec8d8139, 0x03bdea20,
+ 0xe5f20e92, 0x0ac2658b, 0x3e7eae51, 0xd14ec548,
+ 0x570739e5, 0xb83752fc, 0x8c8b9926, 0x63bbf23f,
+ 0x45f826b3, 0xaac84daa, 0x9e748670, 0x7144ed69,
+ 0xf70d11c4, 0x183d7add, 0x2c81b107, 0xc3b1da1e,
+ 0x25fe3eac, 0xcace55b5, 0xfe729e6f, 0x1142f576,
+ 0x970b09db, 0x783b62c2, 0x4c87a918, 0xa3b7c201,
+ 0x0e045beb, 0xe13430f2, 0xd588fb28, 0x3ab89031,
+ 0xbcf16c9c, 0x53c10785, 0x677dcc5f, 0x884da746,
+ 0x6e0243f4, 0x813228ed, 0xb58ee337, 0x5abe882e,
+ 0xdcf77483, 0x33c71f9a, 0x077bd440, 0xe84bbf59,
+ 0xce086bd5, 0x213800cc, 0x1584cb16, 0xfab4a00f,
+ 0x7cfd5ca2, 0x93cd37bb, 0xa771fc61, 0x48419778,
+ 0xae0e73ca, 0x413e18d3, 0x7582d309, 0x9ab2b810,
+ 0x1cfb44bd, 0xf3cb2fa4, 0xc777e47e, 0x28478f67,
+ 0x8bf04d66, 0x64c0267f, 0x507ceda5, 0xbf4c86bc,
+ 0x39057a11, 0xd6351108, 0xe289dad2, 0x0db9b1cb,
+ 0xebf65579, 0x04c63e60, 0x307af5ba, 0xdf4a9ea3,
+ 0x5903620e, 0xb6330917, 0x828fc2cd, 0x6dbfa9d4,
+ 0x4bfc7d58, 0xa4cc1641, 0x9070dd9b, 0x7f40b682,
+ 0xf9094a2f, 0x16392136, 0x2285eaec, 0xcdb581f5,
+ 0x2bfa6547, 0xc4ca0e5e, 0xf076c584, 0x1f46ae9d,
+ 0x990f5230, 0x763f3929, 0x4283f2f3, 0xadb399ea,
+ 0x1c08b7d6, 0xf338dccf, 0xc7841715, 0x28b47c0c,
+ 0xaefd80a1, 0x41cdebb8, 0x75712062, 0x9a414b7b,
+ 0x7c0eafc9, 0x933ec4d0, 0xa7820f0a, 0x48b26413,
+ 0xcefb98be, 0x21cbf3a7, 0x1577387d, 0xfa475364,
+ 0xdc0487e8, 0x3334ecf1, 0x0788272b, 0xe8b84c32,
+ 0x6ef1b09f, 0x81c1db86, 0xb57d105c, 0x5a4d7b45,
+ 0xbc029ff7, 0x5332f4ee, 0x678e3f34, 0x88be542d,
+ 0x0ef7a880, 0xe1c7c399, 0xd57b0843, 0x3a4b635a,
+ 0x99fca15b, 0x76ccca42, 0x42700198, 0xad406a81,
+ 0x2b09962c, 0xc439fd35, 0xf08536ef, 0x1fb55df6,
+ 0xf9fab944, 0x16cad25d, 0x22761987, 0xcd46729e,
+ 0x4b0f8e33, 0xa43fe52a, 0x90832ef0, 0x7fb345e9,
+ 0x59f09165, 0xb6c0fa7c, 0x827c31a6, 0x6d4c5abf,
+ 0xeb05a612, 0x0435cd0b, 0x308906d1, 0xdfb96dc8,
+ 0x39f6897a, 0xd6c6e263, 0xe27a29b9, 0x0d4a42a0,
+ 0x8b03be0d, 0x6433d514, 0x508f1ece, 0xbfbf75d7,
+ 0x120cec3d, 0xfd3c8724, 0xc9804cfe, 0x26b027e7,
+ 0xa0f9db4a, 0x4fc9b053, 0x7b757b89, 0x94451090,
+ 0x720af422, 0x9d3a9f3b, 0xa98654e1, 0x46b63ff8,
+ 0xc0ffc355, 0x2fcfa84c, 0x1b736396, 0xf443088f,
+ 0xd200dc03, 0x3d30b71a, 0x098c7cc0, 0xe6bc17d9,
+ 0x60f5eb74, 0x8fc5806d, 0xbb794bb7, 0x544920ae,
+ 0xb206c41c, 0x5d36af05, 0x698a64df, 0x86ba0fc6,
+ 0x00f3f36b, 0xefc39872, 0xdb7f53a8, 0x344f38b1,
+ 0x97f8fab0, 0x78c891a9, 0x4c745a73, 0xa344316a,
+ 0x250dcdc7, 0xca3da6de, 0xfe816d04, 0x11b1061d,
+ 0xf7fee2af, 0x18ce89b6, 0x2c72426c, 0xc3422975,
+ 0x450bd5d8, 0xaa3bbec1, 0x9e87751b, 0x71b71e02,
+ 0x57f4ca8e, 0xb8c4a197, 0x8c786a4d, 0x63480154,
+ 0xe501fdf9, 0x0a3196e0, 0x3e8d5d3a, 0xd1bd3623,
+ 0x37f2d291, 0xd8c2b988, 0xec7e7252, 0x034e194b,
+ 0x8507e5e6, 0x6a378eff, 0x5e8b4525, 0xb1bb2e3c
+ },{
+ 0x00000000, 0x68032cc8, 0xd0065990, 0xb8057558,
+ 0xa5e0c5d1, 0xcde3e919, 0x75e69c41, 0x1de5b089,
+ 0x4e2dfd53, 0x262ed19b, 0x9e2ba4c3, 0xf628880b,
+ 0xebcd3882, 0x83ce144a, 0x3bcb6112, 0x53c84dda,
+ 0x9c5bfaa6, 0xf458d66e, 0x4c5da336, 0x245e8ffe,
+ 0x39bb3f77, 0x51b813bf, 0xe9bd66e7, 0x81be4a2f,
+ 0xd27607f5, 0xba752b3d, 0x02705e65, 0x6a7372ad,
+ 0x7796c224, 0x1f95eeec, 0xa7909bb4, 0xcf93b77c,
+ 0x3d5b83bd, 0x5558af75, 0xed5dda2d, 0x855ef6e5,
+ 0x98bb466c, 0xf0b86aa4, 0x48bd1ffc, 0x20be3334,
+ 0x73767eee, 0x1b755226, 0xa370277e, 0xcb730bb6,
+ 0xd696bb3f, 0xbe9597f7, 0x0690e2af, 0x6e93ce67,
+ 0xa100791b, 0xc90355d3, 0x7106208b, 0x19050c43,
+ 0x04e0bcca, 0x6ce39002, 0xd4e6e55a, 0xbce5c992,
+ 0xef2d8448, 0x872ea880, 0x3f2bddd8, 0x5728f110,
+ 0x4acd4199, 0x22ce6d51, 0x9acb1809, 0xf2c834c1,
+ 0x7ab7077a, 0x12b42bb2, 0xaab15eea, 0xc2b27222,
+ 0xdf57c2ab, 0xb754ee63, 0x0f519b3b, 0x6752b7f3,
+ 0x349afa29, 0x5c99d6e1, 0xe49ca3b9, 0x8c9f8f71,
+ 0x917a3ff8, 0xf9791330, 0x417c6668, 0x297f4aa0,
+ 0xe6ecfddc, 0x8eefd114, 0x36eaa44c, 0x5ee98884,
+ 0x430c380d, 0x2b0f14c5, 0x930a619d, 0xfb094d55,
+ 0xa8c1008f, 0xc0c22c47, 0x78c7591f, 0x10c475d7,
+ 0x0d21c55e, 0x6522e996, 0xdd279cce, 0xb524b006,
+ 0x47ec84c7, 0x2fefa80f, 0x97eadd57, 0xffe9f19f,
+ 0xe20c4116, 0x8a0f6dde, 0x320a1886, 0x5a09344e,
+ 0x09c17994, 0x61c2555c, 0xd9c72004, 0xb1c40ccc,
+ 0xac21bc45, 0xc422908d, 0x7c27e5d5, 0x1424c91d,
+ 0xdbb77e61, 0xb3b452a9, 0x0bb127f1, 0x63b20b39,
+ 0x7e57bbb0, 0x16549778, 0xae51e220, 0xc652cee8,
+ 0x959a8332, 0xfd99affa, 0x459cdaa2, 0x2d9ff66a,
+ 0x307a46e3, 0x58796a2b, 0xe07c1f73, 0x887f33bb,
+ 0xf56e0ef4, 0x9d6d223c, 0x25685764, 0x4d6b7bac,
+ 0x508ecb25, 0x388de7ed, 0x808892b5, 0xe88bbe7d,
+ 0xbb43f3a7, 0xd340df6f, 0x6b45aa37, 0x034686ff,
+ 0x1ea33676, 0x76a01abe, 0xcea56fe6, 0xa6a6432e,
+ 0x6935f452, 0x0136d89a, 0xb933adc2, 0xd130810a,
+ 0xccd53183, 0xa4d61d4b, 0x1cd36813, 0x74d044db,
+ 0x27180901, 0x4f1b25c9, 0xf71e5091, 0x9f1d7c59,
+ 0x82f8ccd0, 0xeafbe018, 0x52fe9540, 0x3afdb988,
+ 0xc8358d49, 0xa036a181, 0x1833d4d9, 0x7030f811,
+ 0x6dd54898, 0x05d66450, 0xbdd31108, 0xd5d03dc0,
+ 0x8618701a, 0xee1b5cd2, 0x561e298a, 0x3e1d0542,
+ 0x23f8b5cb, 0x4bfb9903, 0xf3feec5b, 0x9bfdc093,
+ 0x546e77ef, 0x3c6d5b27, 0x84682e7f, 0xec6b02b7,
+ 0xf18eb23e, 0x998d9ef6, 0x2188ebae, 0x498bc766,
+ 0x1a438abc, 0x7240a674, 0xca45d32c, 0xa246ffe4,
+ 0xbfa34f6d, 0xd7a063a5, 0x6fa516fd, 0x07a63a35,
+ 0x8fd9098e, 0xe7da2546, 0x5fdf501e, 0x37dc7cd6,
+ 0x2a39cc5f, 0x423ae097, 0xfa3f95cf, 0x923cb907,
+ 0xc1f4f4dd, 0xa9f7d815, 0x11f2ad4d, 0x79f18185,
+ 0x6414310c, 0x0c171dc4, 0xb412689c, 0xdc114454,
+ 0x1382f328, 0x7b81dfe0, 0xc384aab8, 0xab878670,
+ 0xb66236f9, 0xde611a31, 0x66646f69, 0x0e6743a1,
+ 0x5daf0e7b, 0x35ac22b3, 0x8da957eb, 0xe5aa7b23,
+ 0xf84fcbaa, 0x904ce762, 0x2849923a, 0x404abef2,
+ 0xb2828a33, 0xda81a6fb, 0x6284d3a3, 0x0a87ff6b,
+ 0x17624fe2, 0x7f61632a, 0xc7641672, 0xaf673aba,
+ 0xfcaf7760, 0x94ac5ba8, 0x2ca92ef0, 0x44aa0238,
+ 0x594fb2b1, 0x314c9e79, 0x8949eb21, 0xe14ac7e9,
+ 0x2ed97095, 0x46da5c5d, 0xfedf2905, 0x96dc05cd,
+ 0x8b39b544, 0xe33a998c, 0x5b3fecd4, 0x333cc01c,
+ 0x60f48dc6, 0x08f7a10e, 0xb0f2d456, 0xd8f1f89e,
+ 0xc5144817, 0xad1764df, 0x15121187, 0x7d113d4f
+ },{
+ 0x00000000, 0x493c7d27, 0x9278fa4e, 0xdb448769,
+ 0x211d826d, 0x6821ff4a, 0xb3657823, 0xfa590504,
+ 0x423b04da, 0x0b0779fd, 0xd043fe94, 0x997f83b3,
+ 0x632686b7, 0x2a1afb90, 0xf15e7cf9, 0xb86201de,
+ 0x847609b4, 0xcd4a7493, 0x160ef3fa, 0x5f328edd,
+ 0xa56b8bd9, 0xec57f6fe, 0x37137197, 0x7e2f0cb0,
+ 0xc64d0d6e, 0x8f717049, 0x5435f720, 0x1d098a07,
+ 0xe7508f03, 0xae6cf224, 0x7528754d, 0x3c14086a,
+ 0x0d006599, 0x443c18be, 0x9f789fd7, 0xd644e2f0,
+ 0x2c1de7f4, 0x65219ad3, 0xbe651dba, 0xf759609d,
+ 0x4f3b6143, 0x06071c64, 0xdd439b0d, 0x947fe62a,
+ 0x6e26e32e, 0x271a9e09, 0xfc5e1960, 0xb5626447,
+ 0x89766c2d, 0xc04a110a, 0x1b0e9663, 0x5232eb44,
+ 0xa86bee40, 0xe1579367, 0x3a13140e, 0x732f6929,
+ 0xcb4d68f7, 0x827115d0, 0x593592b9, 0x1009ef9e,
+ 0xea50ea9a, 0xa36c97bd, 0x782810d4, 0x31146df3,
+ 0x1a00cb32, 0x533cb615, 0x8878317c, 0xc1444c5b,
+ 0x3b1d495f, 0x72213478, 0xa965b311, 0xe059ce36,
+ 0x583bcfe8, 0x1107b2cf, 0xca4335a6, 0x837f4881,
+ 0x79264d85, 0x301a30a2, 0xeb5eb7cb, 0xa262caec,
+ 0x9e76c286, 0xd74abfa1, 0x0c0e38c8, 0x453245ef,
+ 0xbf6b40eb, 0xf6573dcc, 0x2d13baa5, 0x642fc782,
+ 0xdc4dc65c, 0x9571bb7b, 0x4e353c12, 0x07094135,
+ 0xfd504431, 0xb46c3916, 0x6f28be7f, 0x2614c358,
+ 0x1700aeab, 0x5e3cd38c, 0x857854e5, 0xcc4429c2,
+ 0x361d2cc6, 0x7f2151e1, 0xa465d688, 0xed59abaf,
+ 0x553baa71, 0x1c07d756, 0xc743503f, 0x8e7f2d18,
+ 0x7426281c, 0x3d1a553b, 0xe65ed252, 0xaf62af75,
+ 0x9376a71f, 0xda4ada38, 0x010e5d51, 0x48322076,
+ 0xb26b2572, 0xfb575855, 0x2013df3c, 0x692fa21b,
+ 0xd14da3c5, 0x9871dee2, 0x4335598b, 0x0a0924ac,
+ 0xf05021a8, 0xb96c5c8f, 0x6228dbe6, 0x2b14a6c1,
+ 0x34019664, 0x7d3deb43, 0xa6796c2a, 0xef45110d,
+ 0x151c1409, 0x5c20692e, 0x8764ee47, 0xce589360,
+ 0x763a92be, 0x3f06ef99, 0xe44268f0, 0xad7e15d7,
+ 0x572710d3, 0x1e1b6df4, 0xc55fea9d, 0x8c6397ba,
+ 0xb0779fd0, 0xf94be2f7, 0x220f659e, 0x6b3318b9,
+ 0x916a1dbd, 0xd856609a, 0x0312e7f3, 0x4a2e9ad4,
+ 0xf24c9b0a, 0xbb70e62d, 0x60346144, 0x29081c63,
+ 0xd3511967, 0x9a6d6440, 0x4129e329, 0x08159e0e,
+ 0x3901f3fd, 0x703d8eda, 0xab7909b3, 0xe2457494,
+ 0x181c7190, 0x51200cb7, 0x8a648bde, 0xc358f6f9,
+ 0x7b3af727, 0x32068a00, 0xe9420d69, 0xa07e704e,
+ 0x5a27754a, 0x131b086d, 0xc85f8f04, 0x8163f223,
+ 0xbd77fa49, 0xf44b876e, 0x2f0f0007, 0x66337d20,
+ 0x9c6a7824, 0xd5560503, 0x0e12826a, 0x472eff4d,
+ 0xff4cfe93, 0xb67083b4, 0x6d3404dd, 0x240879fa,
+ 0xde517cfe, 0x976d01d9, 0x4c2986b0, 0x0515fb97,
+ 0x2e015d56, 0x673d2071, 0xbc79a718, 0xf545da3f,
+ 0x0f1cdf3b, 0x4620a21c, 0x9d642575, 0xd4585852,
+ 0x6c3a598c, 0x250624ab, 0xfe42a3c2, 0xb77edee5,
+ 0x4d27dbe1, 0x041ba6c6, 0xdf5f21af, 0x96635c88,
+ 0xaa7754e2, 0xe34b29c5, 0x380faeac, 0x7133d38b,
+ 0x8b6ad68f, 0xc256aba8, 0x19122cc1, 0x502e51e6,
+ 0xe84c5038, 0xa1702d1f, 0x7a34aa76, 0x3308d751,
+ 0xc951d255, 0x806daf72, 0x5b29281b, 0x1215553c,
+ 0x230138cf, 0x6a3d45e8, 0xb179c281, 0xf845bfa6,
+ 0x021cbaa2, 0x4b20c785, 0x906440ec, 0xd9583dcb,
+ 0x613a3c15, 0x28064132, 0xf342c65b, 0xba7ebb7c,
+ 0x4027be78, 0x091bc35f, 0xd25f4436, 0x9b633911,
+ 0xa777317b, 0xee4b4c5c, 0x350fcb35, 0x7c33b612,
+ 0x866ab316, 0xcf56ce31, 0x14124958, 0x5d2e347f,
+ 0xe54c35a1, 0xac704886, 0x7734cfef, 0x3e08b2c8,
+ 0xc451b7cc, 0x8d6dcaeb, 0x56294d82, 0x1f1530a5
+ }
+#endif
+};
+
+/*
+ * __wt_cksum_sw --
+ * Return a checksum for a chunk of memory, computed in software.
+ *
+ * Slicing-by-8 algorithm by Michael E. Kounavis and Frank L. Berry from
+ * Intel Corp.:
+ * http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
+ *
+ * Based on Peter Kankowski's posting:
+ * http://www.strchr.com/crc32_popcnt
+ *
+ * The big endian version calculates the same result at each step, except the
+ * value of the crc is byte reversed from what it would be at that step for
+ * little endian.
+ */
+static uint32_t
+__wt_cksum_sw(const void *chunk, size_t len)
+{
+ uint32_t crc, next;
+ size_t nqwords;
+ const uint8_t *p;
+
+ crc = 0xffffffff;
+
+ /* Checksum one byte at a time to the first 4B boundary. */
+ for (p = chunk;
+ ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+ len > 0; ++p, --len)
+#ifdef WORDS_BIGENDIAN
+ crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);
+#else
+ crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
+#endif
+
+ /* Checksum in 8B chunks. */
+ for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
+ crc ^= *(uint32_t *)p;
+ p += sizeof(uint32_t);
+ next = *(uint32_t *)p;
+ p += sizeof(uint32_t);
+ crc =
+#ifdef WORDS_BIGENDIAN
+ g_crc_slicing[4][(crc ) & 0xFF] ^
+ g_crc_slicing[5][(crc >> 8) & 0xFF] ^
+ g_crc_slicing[6][(crc >> 16) & 0xFF] ^
+ g_crc_slicing[7][(crc >> 24)] ^
+ g_crc_slicing[0][(next ) & 0xFF] ^
+ g_crc_slicing[1][(next >> 8) & 0xFF] ^
+ g_crc_slicing[2][(next >> 16) & 0xFF] ^
+ g_crc_slicing[3][(next >> 24)];
+#else
+ g_crc_slicing[7][(crc ) & 0xFF] ^
+ g_crc_slicing[6][(crc >> 8) & 0xFF] ^
+ g_crc_slicing[5][(crc >> 16) & 0xFF] ^
+ g_crc_slicing[4][(crc >> 24)] ^
+ g_crc_slicing[3][(next ) & 0xFF] ^
+ g_crc_slicing[2][(next >> 8) & 0xFF] ^
+ g_crc_slicing[1][(next >> 16) & 0xFF] ^
+ g_crc_slicing[0][(next >> 24)];
+#endif
+ }
+
+ /* Checksum trailing bytes one byte at a time. */
+#ifdef WORDS_BIGENDIAN
+ for (len &= 0x7; len > 0; ++p, len--)
+ crc = g_crc_slicing[0][((crc >> 24) ^ *p) & 0xFF] ^ (crc << 8);
+
+ /* Do final byte swap to produce a result identical to little endian */
+ crc =
+ ((crc << 24) & 0xFF000000) |
+ ((crc << 8) & 0x00FF0000) |
+ ((crc >> 8) & 0x0000FF00) |
+ ((crc >> 24) & 0x000000FF);
+#else
+ for (len &= 0x7; len > 0; ++p, len--)
+ crc = g_crc_slicing[0][(crc ^ *p) & 0xFF] ^ (crc >> 8);
+#endif
+ return (~crc);
+}
+
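+/*
+ * Editor's sketch, not part of the original patch: the tables above are
+ * pre-computed, but the little-endian tables can be regenerated with the
+ * standard slicing-by-8 construction for the reflected CRC-32C polynomial
+ * 0x82f63b78 (the big-endian tables hold the same entries byte-reversed).
+ */
+#if 0
+static void
+__crc32c_gen_tables(uint32_t table[8][256])
+{
+	uint32_t crc;
+	int i, j, k;
+
+	/* Table 0 is the classic bit-at-a-time lookup table. */
+	for (i = 0; i < 256; i++) {
+		for (crc = (uint32_t)i, j = 0; j < 8; j++)
+			crc = (crc & 1) ?
+			    (crc >> 1) ^ 0x82f63b78 : crc >> 1;
+		table[0][i] = crc;	/* E.g., table[0][1] == 0xf26b8303. */
+	}
+	/* Each further table advances the CRC by one more byte per lookup. */
+	for (k = 1; k < 8; k++)
+		for (i = 0; i < 256; i++)
+			table[k][i] = (table[k - 1][i] >> 8) ^
+			    table[0][table[k - 1][i] & 0xff];
+}
+#endif
+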
+#if (defined(__amd64) || defined(__x86_64))
+/*
+ * __wt_cksum_hw --
+ * Return a checksum for a chunk of memory, computed in hardware
+ * using 8-byte steps.
+ */
+static uint32_t
+__wt_cksum_hw(const void *chunk, size_t len)
+{
+ uint32_t crc;
+ size_t nqwords;
+ const uint8_t *p;
+ const uint64_t *p64;
+
+ crc = 0xffffffff;
+
+ /* Checksum one byte at a time to the first 4B boundary. */
+ for (p = chunk;
+ ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+ len > 0; ++p, --len) {
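+		/*
+		 * Editor's note: the .byte sequence below hand-encodes the
+		 * SSE4.2 instruction "crc32b %cl, %esi", presumably so the
+		 * file still assembles with toolchains that predate the
+		 * mnemonic.
+		 */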
+ __asm__ __volatile__(
+ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1"
+ : "=S" (crc)
+ : "0" (crc), "c" (*p));
+ }
+
+ p64 = (const uint64_t *)p;
+ /* Checksum in 8B chunks. */
+ for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
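+		/*
+		 * Editor's note: the .byte sequence below hand-encodes
+		 * "crc32q %rcx, %rsi", folding 8 bytes of data into the
+		 * checksum per instruction.
+		 */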
+ __asm__ __volatile__ (
+ ".byte 0xF2, 0x48, 0x0F, 0x38, 0xF1, 0xF1"
+ : "=S"(crc)
+ : "0"(crc), "c" (*p64));
+ p64++;
+ }
+
+ /* Checksum trailing bytes one byte at a time. */
+ p = (const uint8_t *)p64;
+ for (len &= 0x7; len > 0; ++p, len--) {
+ __asm__ __volatile__(
+ ".byte 0xF2, 0x0F, 0x38, 0xF0, 0xF1"
+ : "=S" (crc)
+ : "0" (crc), "c" (*p));
+ }
+ return (~crc);
+}
+#endif
+
+#if defined(_M_AMD64)
+/*
+ * __wt_cksum_hw --
+ * Return a checksum for a chunk of memory, computed in hardware
+ * using 8-byte steps.
+ */
+static uint32_t
+__wt_cksum_hw(const void *chunk, size_t len)
+{
+ uint32_t crc;
+ size_t nqwords;
+ const uint8_t *p;
+ const uint64_t *p64;
+
+ crc = 0xffffffff;
+
+ /* Checksum one byte at a time to the first 4B boundary. */
+ for (p = chunk;
+ ((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0 &&
+ len > 0; ++p, --len) {
+ crc = _mm_crc32_u8(crc, *p);
+ }
+
+ p64 = (const uint64_t *)p;
+ /* Checksum in 8B chunks. */
+ for (nqwords = len / sizeof(uint64_t); nqwords; nqwords--) {
+ crc = (uint32_t)_mm_crc32_u64(crc, *p64);
+ p64++;
+ }
+
+ /* Checksum trailing bytes one byte at a time. */
+ p = (const uint8_t *)p64;
+ for (len &= 0x7; len > 0; ++p, len--) {
+ crc = _mm_crc32_u8(crc, *p);
+ }
+
+ return (~crc);
+}
+#endif
+
+/*
+ * __wt_cksum --
+ * Return a checksum for a chunk of memory using the fastest method
+ * available.
+ */
+uint32_t
+__wt_cksum(const void *chunk, size_t len)
+{
+	return ((*__wt_cksum_func)(chunk, len));
+}
+
+/*
+ * __wt_cksum_init --
+ * Detect CRC hardware and set the checksum function.
+ */
+void
+__wt_cksum_init(void)
+{
+#define CPUID_ECX_HAS_SSE42 (1 << 20)
+
+#if (defined(__amd64) || defined(__x86_64))
+ unsigned int eax, ebx, ecx, edx;
+
+ __asm__ __volatile__ (
+ "cpuid"
+ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+ : "a" (1));
+
+ if (ecx & CPUID_ECX_HAS_SSE42)
+ __wt_cksum_func = __wt_cksum_hw;
+ else
+ __wt_cksum_func = __wt_cksum_sw;
+
+#elif defined(_M_AMD64)
+ int cpuInfo[4];
+
+ __cpuid(cpuInfo, 1);
+
+ if (cpuInfo[2] & CPUID_ECX_HAS_SSE42)
+ __wt_cksum_func = __wt_cksum_hw;
+ else
+ __wt_cksum_func = __wt_cksum_sw;
+#else
+ __wt_cksum_func = __wt_cksum_sw;
+#endif
+}
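+
+/*
+ * Editor's sketch, not part of the original patch: expected call sequence,
+ * using the standard CRC-32C check value as a sanity test.
+ */
+#if 0
+	uint32_t crc;
+
+	__wt_cksum_init();	/* Select the hw or sw implementation once. */
+	crc = __wt_cksum("123456789", 9);
+	/* CRC-32C of "123456789" is the well-known check value 0xe3069283. */
+#endif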
diff --git a/src/third_party/wiredtiger/src/support/err.c b/src/third_party/wiredtiger/src/support/err.c
new file mode 100644
index 00000000000..3e874078fbf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/err.c
@@ -0,0 +1,527 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __handle_error_default --
+ * Default WT_EVENT_HANDLER->handle_error implementation: send to stderr.
+ */
+static int
+__handle_error_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *errmsg)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(error);
+
+ return (fprintf(stderr, "%s\n", errmsg) >= 0 &&
+ fflush(stderr) == 0 ? 0 : __wt_errno());
+}
+
+/*
+ * __handle_message_default --
+ * Default WT_EVENT_HANDLER->handle_message implementation: send to stdout.
+ */
+static int
+__handle_message_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (printf("%s\n", message) >= 0 &&
+ fflush(stdout) == 0 ? 0 : __wt_errno());
+}
+
+/*
+ * __handle_progress_default --
+ * Default WT_EVENT_HANDLER->handle_progress implementation: ignore.
+ */
+static int
+__handle_progress_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(operation);
+ WT_UNUSED(progress);
+
+ return (0);
+}
+
+/*
+ * __handle_close_default --
+ * Default WT_EVENT_HANDLER->handle_close implementation: ignore.
+ */
+static int
+__handle_close_default(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, WT_CURSOR *cursor)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(cursor);
+
+ return (0);
+}
+
+static WT_EVENT_HANDLER __event_handler_default = {
+ __handle_error_default,
+ __handle_message_default,
+ __handle_progress_default,
+ __handle_close_default
+};
+
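+/*
+ * Editor's sketch, not part of the original patch: an application would
+ * typically supply its own handler through the public wiredtiger_open API;
+ * NULL error/message/progress slots are later filled in with the defaults
+ * above by __wt_event_handler_set.
+ */
+#if 0
+static int
+my_handle_error(WT_EVENT_HANDLER *handler,
+    WT_SESSION *session, int error, const char *errmsg)
+{
+	WT_UNUSED(handler);
+	WT_UNUSED(session);
+
+	return (fprintf(stderr,
+	    "app error %d: %s\n", error, errmsg) >= 0 ? 0 : EIO);
+}
+
+static WT_EVENT_HANDLER my_event_handler = {
+	my_handle_error, NULL, NULL, NULL
+};
+
+/* wiredtiger_open(home, &my_event_handler, "create", &conn); */
+#endif
+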
+/*
+ * __handler_failure --
+ * Report the failure of an application-configured event handler.
+ */
+static void
+__handler_failure(WT_SESSION_IMPL *session,
+ int error, const char *which, int error_handler_failed)
+{
+ WT_EVENT_HANDLER *handler;
+ WT_SESSION *wt_session;
+
+ /*
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[256];
+
+ (void)snprintf(s, sizeof(s),
+ "application %s event handler failed: %s",
+ which, wiredtiger_strerror(error));
+
+ /*
+ * Use the error handler to report the failure, unless it was the error
+ * handler that failed. If it was the error handler that failed, or a
+ * call to the error handler fails, use the default error handler.
+ */
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ if (!error_handler_failed &&
+ handler->handle_error != __handle_error_default &&
+ handler->handle_error(handler, wt_session, error, s) == 0)
+ return;
+
+ (void)__handle_error_default(NULL, wt_session, error, s);
+}
+
+/*
+ * __wt_event_handler_set --
+ * Set an event handler, filling in any NULL methods with the defaults.
+ */
+void
+__wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler)
+{
+ if (handler == NULL)
+ handler = &__event_handler_default;
+ else {
+ if (handler->handle_error == NULL)
+ handler->handle_error = __handle_error_default;
+ if (handler->handle_message == NULL)
+ handler->handle_message = __handle_message_default;
+ if (handler->handle_progress == NULL)
+ handler->handle_progress = __handle_progress_default;
+ }
+
+ session->event_handler = handler;
+}
+
+/*
+ * __wt_eventv --
+ * Report a message to an event handler.
+ */
+int
+__wt_eventv(WT_SESSION_IMPL *session, int msg_event, int error,
+ const char *file_name, int line_number, const char *fmt, va_list ap)
+{
+ WT_EVENT_HANDLER *handler;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+ struct timespec ts;
+ size_t len, remain, wlen;
+ int prefix_cnt;
+ const char *err, *prefix;
+ char *end, *p, tid[128];
+
+ /*
+ * We're using a stack buffer because we want error messages no matter
+ * what, and allocating a WT_ITEM, or the memory it needs, might fail.
+ *
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[2048];
+
+ /*
+ * !!!
+ * This function MUST handle a NULL WT_SESSION_IMPL handle.
+ *
+ * Without a session, we don't have event handlers or prefixes for the
+ * error message. Write the error to stderr and call it a day. (It's
+ * almost impossible for that to happen given how early we allocate the
+ * first session, but if the allocation of the first session fails, for
+ * example, we can end up here without a session.)
+ */
+ if (session == NULL)
+ return (fprintf(stderr, "WiredTiger Error%s%s\n",
+ error == 0 ? "" : ": ",
+ error == 0 ? "" : wiredtiger_strerror(error)) >= 0 &&
+ fflush(stderr) == 0 ? 0 : __wt_errno());
+
+ p = s;
+ end = s + sizeof(s);
+
+ /*
+ * We have several prefixes for the error message:
+ * a timestamp and the process and thread ids, the database error
+ * prefix, the data-source's name, and the session's name. Write them
+ * as a comma-separated list, followed by a colon.
+ */
+ prefix_cnt = 0;
+ if (__wt_epoch(session, &ts) == 0) {
+ __wt_thread_id(tid, sizeof(tid));
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "[%" PRIuMAX ":%" PRIuMAX "][%s]",
+ (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if ((prefix = S2C(session)->error_prefix) != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ prefix = session->dhandle == NULL ? NULL : session->dhandle->name;
+ if (prefix != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if ((prefix = session->name) != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain,
+ "%s%s", prefix_cnt == 0 ? "" : ", ", prefix);
+ p = wlen >= remain ? end : p + wlen;
+ prefix_cnt = 1;
+ }
+ if (prefix_cnt != 0) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)snprintf(p, remain, ": ");
+ p = wlen >= remain ? end : p + wlen;
+ }
+
+ if (file_name != NULL) {
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)
+ snprintf(p, remain, "%s, %d: ", file_name, line_number);
+ p = wlen >= remain ? end : p + wlen;
+ }
+
+ remain = WT_PTRDIFF(end, p);
+ wlen = (size_t)vsnprintf(p, remain, fmt, ap);
+ p = wlen >= remain ? end : p + wlen;
+
+ if (error != 0) {
+ /*
+ * When the engine calls __wt_err on error, it often outputs an
+ * error message including the string associated with the error
+ * it's returning. We could change the calls to call __wt_errx,
+ * but it's simpler to not append an error string if all we are
+ * doing is duplicating an existing error string.
+ *
+ * Use strcmp to compare: both strings are nul-terminated, and
+ * we don't want to run past the end of the buffer.
+ */
+ err = wiredtiger_strerror(error);
+ len = strlen(err);
+ if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) {
+ remain = WT_PTRDIFF(end, p);
+ (void)snprintf(p, remain, ": %s", err);
+ }
+ }
+
+ /*
+ * If a handler fails, return the error status: if we're in the process
+ * of handling an error, any return value we provide will be ignored by
+ * our caller; the caller presumably already has an error value it will
+ * be returning.
+ *
+ * If an application-specified or default informational message handler
+ * fails, complain using the application-specified or default error
+ * handler.
+ *
+ * If an application-specified error message handler fails, complain
+ * using the default error handler. If the default error handler fails,
+ * there's nothing to do.
+ */
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ if (msg_event) {
+ ret = handler->handle_message(handler, wt_session, s);
+ if (ret != 0)
+ __handler_failure(session, ret, "message", 0);
+ } else {
+ ret = handler->handle_error(handler, wt_session, error, s);
+ if (ret != 0 && handler->handle_error != __handle_error_default)
+ __handler_failure(session, ret, "error", 1);
+ }
+
+ return (ret);
+}
+
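+/*
+ * Editor's note, not part of the original patch: a fully-prefixed message
+ * assembled above looks roughly like
+ *	[1403633508:123456][26347:0x7f1a], eviction-server, file:test.wt:
+ *	    example message: No such file or directory
+ * where every value shown is purely illustrative.
+ */
+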
+/*
+ * __wt_err --
+ * Report an error.
+ */
+void
+__wt_err(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+
+ /*
+ * Ignore error returns from underlying event handlers; we already have
+ * an error value to return.
+ */
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, error, NULL, 0, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_errx --
+ * Report an error with no error code.
+ */
+void
+__wt_errx(WT_SESSION_IMPL *session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ va_list ap;
+
+ /*
+ * Ignore error returns from underlying event handlers; we already have
+ * an error value to return.
+ */
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * __wt_ext_err_printf --
+ * Extension API call to print to the error stream.
+ */
+int
+__wt_ext_err_printf(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = __wt_eventv(session, 0, 0, NULL, 0, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * info_msg --
+ * Informational message.
+ */
+static int
+info_msg(WT_SESSION_IMPL *session, const char *fmt, va_list ap)
+{
+ WT_EVENT_HANDLER *handler;
+ WT_SESSION *wt_session;
+
+ /*
+ * !!!
+ * SECURITY:
+ * Buffer placed at the end of the stack in case snprintf overflows.
+ */
+ char s[2048];
+
+ (void)vsnprintf(s, sizeof(s), fmt, ap);
+
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ return (handler->handle_message(handler, wt_session, s));
+}
+
+/*
+ * __wt_msg --
+ * Informational message.
+ */
+int
+__wt_msg(WT_SESSION_IMPL *session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3)))
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = info_msg(session, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_ext_msg_printf --
+ * Extension API call to print to the message stream.
+ */
+int
+__wt_ext_msg_printf(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ va_list ap;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ va_start(ap, fmt);
+ ret = info_msg(session, fmt, ap);
+ va_end(ap);
+ return (ret);
+}
+
+/*
+ * __wt_progress --
+ * Progress message.
+ */
+int
+__wt_progress(WT_SESSION_IMPL *session, const char *s, uint64_t v)
+{
+ WT_DECL_RET;
+ WT_EVENT_HANDLER *handler;
+ WT_SESSION *wt_session;
+
+ wt_session = (WT_SESSION *)session;
+ handler = session->event_handler;
+ if (handler != NULL && handler->handle_progress != NULL)
+ if ((ret = handler->handle_progress(handler,
+ wt_session, s == NULL ? session->name : s, v)) != 0)
+ __handler_failure(session, ret, "progress", 0);
+ return (0);
+}
+
+/*
+ * __wt_assert --
+ * Report assertion and other unexpected failures; includes file/line
+ * information for debugging.
+ */
+void
+__wt_assert(WT_SESSION_IMPL *session,
+ int error, const char *file_name, int line_number, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 5, 6)))
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ (void)__wt_eventv(session, 0, error, file_name, line_number, fmt, ap);
+ va_end(ap);
+
+#ifdef HAVE_DIAGNOSTIC
+ __wt_abort(session); /* Drop core if testing. */
+ /* NOTREACHED */
+#endif
+}
+
+/*
+ * __wt_panic --
+ * A standard error message when we panic.
+ */
+int
+__wt_panic(WT_SESSION_IMPL *session)
+{
+ F_SET(S2C(session), WT_CONN_PANIC);
+ __wt_errx(session, "%s",
+ "the WiredTiger library cannot continue; the process must exit "
+ "and restart");
+
+#if !defined(HAVE_DIAGNOSTIC)
+ /*
+ * Chaos reigns within.
+ * Reflect, repent, and reboot.
+ * Order shall return.
+ */
+ return (WT_PANIC);
+#endif
+
+ __wt_abort(session); /* Drop core if testing. */
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_illegal_value --
+ * A standard error message when we detect an illegal value.
+ */
+int
+__wt_illegal_value(WT_SESSION_IMPL *session, const char *name)
+{
+ __wt_errx(session, "%s%s%s",
+ name == NULL ? "" : name, name == NULL ? "" : ": ",
+ "encountered an illegal file format or internal value");
+
+#if !defined(HAVE_DIAGNOSTIC)
+ return (__wt_panic(session));
+#endif
+
+ __wt_abort(session); /* Drop core if testing. */
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_object_unsupported --
+ * Print a standard error message for an object that doesn't support a
+ * particular operation.
+ */
+int
+__wt_object_unsupported(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_RET_MSG(session, ENOTSUP, "unsupported object operation: %s", uri);
+}
+
+/*
+ * __wt_bad_object_type --
+ * Print a standard error message when given an unknown or unsupported
+ * object type.
+ */
+int
+__wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri)
+{
+ if (WT_PREFIX_MATCH(uri, "backup:") ||
+ WT_PREFIX_MATCH(uri, "colgroup:") ||
+ WT_PREFIX_MATCH(uri, "config:") ||
+ WT_PREFIX_MATCH(uri, "file:") ||
+ WT_PREFIX_MATCH(uri, "index:") ||
+ WT_PREFIX_MATCH(uri, "log:") ||
+ WT_PREFIX_MATCH(uri, "lsm:") ||
+ WT_PREFIX_MATCH(uri, "statistics:") ||
+ WT_PREFIX_MATCH(uri, "table:"))
+ return (__wt_object_unsupported(session, uri));
+
+ WT_RET_MSG(session, ENOTSUP, "unknown object type: %s", uri);
+}
diff --git a/src/third_party/wiredtiger/src/support/filename.c b/src/third_party/wiredtiger/src/support/filename.c
new file mode 100644
index 00000000000..bd5d03fa633
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/filename.c
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_filename --
+ * Build a file name in a scratch buffer, automatically calculating the
+ * length of the file name.
+ */
+int
+__wt_filename(WT_SESSION_IMPL *session, const char *name, char **path)
+{
+ return (__wt_nfilename(session, name, strlen(name), path));
+}
+
+/*
+ * __wt_nfilename --
+ * Build a file name in a scratch buffer. If the name is already an
+ * absolute path, duplicate it; otherwise, generate a path relative to the
+ * connection home directory.
+ */
+int
+__wt_nfilename(
+ WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path)
+{
+ WT_CONNECTION_IMPL *conn;
+ size_t len;
+ char *buf;
+
+ conn = S2C(session);
+ *path = NULL;
+
+ if (__wt_absolute_path(name))
+ WT_RET(__wt_strndup(session, name, namelen, path));
+ else {
+ len = strlen(conn->home) + 1 + namelen + 1;
+ WT_RET(__wt_calloc(session, 1, len, &buf));
+ snprintf(buf, len, "%s%s%.*s",
+ conn->home, __wt_path_separator(), (int)namelen, name);
+ *path = buf;
+ }
+
+ return (0);
+}
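+
+/*
+ * Illustrative sketch, not part of the original source ("WiredTiger.wt" is
+ * a made-up name): with a connection home of "/data/db",
+ *
+ *	char *path;
+ *	WT_RET(__wt_filename(session, "WiredTiger.wt", &path));
+ *
+ * sets path to "/data/db/WiredTiger.wt"; the allocation belongs to the
+ * caller, who eventually releases it with __wt_free(session, path).
+ */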
diff --git a/src/third_party/wiredtiger/src/support/global.c b/src/third_party/wiredtiger/src/support/global.c
new file mode 100644
index 00000000000..10f718d57f7
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/global.c
@@ -0,0 +1,118 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+WT_PROCESS __wt_process; /* Per-process structure */
+static int __wt_pthread_once_failed; /* If initialization failed */
+
+/*
+ * __system_is_little_endian --
+ * Check if the system is little endian.
+ */
+static int
+__system_is_little_endian(void)
+{
+ uint64_t v;
+ int little;
+
+ v = 1;
+ little = *((uint8_t *)&v) == 0 ? 0 : 1;
+
+ if (little)
+ return (0);
+
+ fprintf(stderr,
+ "This release of the WiredTiger data engine does not support "
+ "big-endian systems; contact WiredTiger for more information.\n");
+ return (EINVAL);
+}
+
+/*
+ * __wt_global_once --
+ * Global initialization, run once.
+ */
+static void
+__wt_global_once(void)
+{
+ WT_DECL_RET;
+
+ if ((ret = __system_is_little_endian()) != 0) {
+ __wt_pthread_once_failed = ret;
+ return;
+ }
+
+ if ((ret =
+ __wt_spin_init(NULL, &__wt_process.spinlock, "global")) != 0) {
+ __wt_pthread_once_failed = ret;
+ return;
+ }
+
+ __wt_cksum_init();
+
+ TAILQ_INIT(&__wt_process.connqh);
+
+#ifdef HAVE_DIAGNOSTIC
+ /* Load debugging code the compiler might optimize out. */
+ (void)__wt_breakpoint();
+#endif
+}
+
+/*
+ * __wt_library_init --
+ * Some things to do, before we do anything else.
+ */
+int
+__wt_library_init(void)
+{
+ static int first = 1;
+ WT_DECL_RET;
+
+ /*
+ * Do per-process initialization before anything else, but do it only
+ * once. I don't know how heavy-weight the function (pthread_once, in
+ * the POSIX world) might be, so I'm front-ending it with a local
+ * static and only using that function to avoid a race.
+ */
+ if (first) {
+ if ((ret = __wt_once(__wt_global_once)) != 0)
+ __wt_pthread_once_failed = ret;
+ first = 0;
+ }
+ return (__wt_pthread_once_failed);
+}
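+
+/*
+ * Illustrative sketch, an assumption about the POSIX build and not part of
+ * the original source: __wt_once is expected to be a thin wrapper over
+ * pthread_once, roughly:
+ *
+ *	int
+ *	__wt_once(void (*init_routine)(void))
+ *	{
+ *		static pthread_once_t once = PTHREAD_ONCE_INIT;
+ *		return (pthread_once(&once, init_routine));
+ *	}
+ */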
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_breakpoint --
+ * A simple place to put a breakpoint, if you need one.
+ */
+int
+__wt_breakpoint(void)
+{
+ return (0);
+}
+
+/*
+ * __wt_attach --
+ * A routine to wait for the debugging to attach.
+ */
+void
+__wt_attach(WT_SESSION_IMPL *session)
+{
+#ifdef HAVE_ATTACH
+ __wt_errx(session, "process ID %" PRIdMAX
+ ": waiting for debugger...", (intmax_t)getpid());
+
+ /* Sleep forever, the debugger will interrupt us when it attaches. */
+ for (;;)
+ __wt_sleep(100, 0);
+#else
+ WT_UNUSED(session);
+#endif
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/support/hash_city.c b/src/third_party/wiredtiger/src/support/hash_city.c
new file mode 100644
index 00000000000..c6978f6bfe6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hash_city.c
@@ -0,0 +1,323 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Copyright (c) 2011 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * CityHash, by Geoff Pike and Jyrki Alakuijala
+ *
+ * This file provides CityHash64() and related functions.
+ *
+ * It's probably possible to create even faster hash functions by
+ * writing a program that systematically explores some of the space of
+ * possible hash functions, by using SIMD instructions, or by
+ * compromising on hash quality.
+ */
+
+#include <string.h>
+#include "wt_internal.h"
+
+/*
+ * Google City Hash implementation. Based on source code from:
+ * http://code.google.com/p/cityhash/
+ */
+
+typedef struct _uint128 uint128;
+struct _uint128 {
+ uint64_t first;
+ uint64_t second;
+};
+
+#define Uint128Low64(x) (x).first
+#define Uint128High64(x) (x).second
+
+static uint64_t UNALIGNED_LOAD64(const char *p) {
+ uint64_t result;
+ memcpy(&result, p, sizeof(result));
+ return (result);
+}
+
+static uint32_t UNALIGNED_LOAD32(const char *p) {
+ uint32_t result;
+ memcpy(&result, p, sizeof(result));
+ return (result);
+}
+
+#if !defined(WORDS_BIGENDIAN)
+
+#define uint32_in_expected_order(x) (x)
+#define uint64_in_expected_order(x) (x)
+
+#else
+
+#ifdef __APPLE__
+/* Mac OS X / Darwin features */
+#include <libkern/OSByteOrder.h>
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#else
+#include <byteswap.h>
+#endif
+
+#define uint32_in_expected_order(x) (bswap_32(x))
+#define uint64_in_expected_order(x) (bswap_64(x))
+
+#endif /* WORDS_BIGENDIAN */
+
+static uint64_t Fetch64(const char *p) {
+ return uint64_in_expected_order(UNALIGNED_LOAD64(p));
+}
+
+static uint32_t Fetch32(const char *p) {
+ return uint32_in_expected_order(UNALIGNED_LOAD32(p));
+}
+
+/* Some primes between 2^63 and 2^64 for various uses. */
+static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
+static const uint64_t k1 = 0xb492b66fbe98f273ULL;
+static const uint64_t k2 = 0x9ae16a3b2f90404fULL;
+static const uint64_t k3 = 0xc949d7c7509e6557ULL;
+
+/*
+ * Hash 128 input bits down to 64 bits of output.
+ * This is intended to be a reasonably good hash function.
+ */
+static inline uint64_t Hash128to64(const uint128 x) {
+ /* Murmur-inspired hashing. */
+ const uint64_t kMul = 0x9ddfea08eb382d69ULL;
+ uint64_t a, b;
+
+ a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
+ a ^= (a >> 47);
+ b = (Uint128High64(x) ^ a) * kMul;
+ b ^= (b >> 47);
+ b *= kMul;
+ return (b);
+}
+
+/*
+ * Bitwise right rotate. Normally this will compile to a single
+ * instruction, especially if the shift is a manifest constant.
+ */
+static uint64_t Rotate(uint64_t val, int shift) {
+ /* Avoid shifting by 64: doing so yields an undefined result. */
+ return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
+}
+
+/*
+ * Equivalent to Rotate(), but requires the second arg to be non-zero.
+ * On x86-64, and probably others, it's possible for this to compile
+ * to a single instruction if both args are already in registers.
+ */
+static uint64_t RotateByAtLeast1(uint64_t val, int shift) {
+ return (val >> shift) | (val << (64 - shift));
+}
+
+static uint64_t ShiftMix(uint64_t val) {
+ return val ^ (val >> 47);
+}
+
+static uint64_t HashLen16(uint64_t u, uint64_t v) {
+ uint128 result;
+
+ result.first = u;
+ result.second = v;
+ return Hash128to64(result);
+}
+
+static uint64_t HashLen0to16(const char *s, size_t len) {
+ uint64_t a64, b64;
+ uint32_t y, z;
+ uint8_t a8, b8, c8;
+ if (len > 8) {
+ a64 = Fetch64(s);
+ b64 = Fetch64(s + len - 8);
+ return HashLen16(
+ a64, RotateByAtLeast1(b64 + len, (int)len)) ^ b64;
+ }
+ if (len >= 4) {
+ a64 = Fetch32(s);
+ return HashLen16(len + (a64 << 3), Fetch32(s + len - 4));
+ }
+ if (len > 0) {
+ a8 = (uint8_t)s[0];
+ b8 = (uint8_t)s[len >> 1];
+ c8 = (uint8_t)s[len - 1];
+ y = (uint32_t)(a8) + ((uint32_t)(b8) << 8);
+ z = (uint32_t)len + ((uint32_t)(c8) << 2);
+ return ShiftMix(y * k2 ^ z * k3) * k2;
+ }
+ return (k2);
+}
+
+/*
+ * This probably works well for 16-byte strings as well, but it may be overkill
+ * in that case.
+ */
+static uint64_t HashLen17to32(const char *s, size_t len) {
+ uint64_t a = Fetch64(s) * k1;
+ uint64_t b = Fetch64(s + 8);
+ uint64_t c = Fetch64(s + len - 8) * k2;
+ uint64_t d = Fetch64(s + len - 16) * k0;
+ return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
+ a + Rotate(b ^ k3, 20) + len - c);
+}
+
+/*
+ * Return a 16-byte hash for 48 bytes. Quick and dirty.
+ * Callers do best to use "random-looking" values for a and b.
+ * static pair<uint64, uint64> WeakHashLen32WithSeeds(
+ */
+static void WeakHashLen32WithSeeds6(uint64_t w, uint64_t x,
+ uint64_t y, uint64_t z, uint64_t a, uint64_t b, uint128 *ret) {
+ uint64_t c;
+
+ a += w;
+ b = Rotate(b + a + z, 21);
+ c = a;
+ a += x;
+ a += y;
+ b += Rotate(a, 44);
+
+ ret->first = (uint64_t) (a + z);
+ ret->second = (uint64_t) (b + c);
+}
+
+/*
+ * Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty.
+ * static pair<uint64, uint64> WeakHashLen32WithSeeds(
+ */
+static void WeakHashLen32WithSeeds(
+ const char* s, uint64_t a, uint64_t b, uint128 *ret) {
+ WeakHashLen32WithSeeds6(Fetch64(s),
+ Fetch64(s + 8),
+ Fetch64(s + 16),
+ Fetch64(s + 24),
+ a,
+ b,
+ ret);
+}
+
+/* Return an 8-byte hash for 33 to 64 bytes. */
+static uint64_t HashLen33to64(const char *s, size_t len) {
+ uint64_t a, b, c, r, vf, vs, wf, ws, z;
+ z = Fetch64(s + 24);
+ a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
+ b = Rotate(a + z, 52);
+ c = Rotate(a, 37);
+ a += Fetch64(s + 8);
+ c += Rotate(a, 7);
+ a += Fetch64(s + 16);
+ vf = a + z;
+ vs = b + Rotate(a, 31) + c;
+ a = Fetch64(s + 16) + Fetch64(s + len - 32);
+ z = Fetch64(s + len - 8);
+ b = Rotate(a + z, 52);
+ c = Rotate(a, 37);
+ a += Fetch64(s + len - 24);
+ c += Rotate(a, 7);
+ a += Fetch64(s + len - 16);
+ wf = a + z;
+ ws = b + Rotate(a, 31) + c;
+ r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
+ return ShiftMix(r * k0 + vs) * k2;
+}
+
+static inline uint64_t CityHash64(const char *s, size_t len) {
+ uint64_t temp, x, y, z;
+ uint128 v, w;
+
+ if (len <= 32) {
+ if (len <= 16) {
+ return HashLen0to16(s, len);
+ } else {
+ return HashLen17to32(s, len);
+ }
+ } else if (len <= 64) {
+ return HashLen33to64(s, len);
+ }
+
+ /*
+ * For strings over 64 bytes we hash the end first, and then as we
+ * loop we keep 56 bytes of state: v, w, x, y, and z.
+ */
+ x = Fetch64(s + len - 40);
+ y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
+ z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
+ WeakHashLen32WithSeeds(s + len - 64, len, z, &v);
+ WeakHashLen32WithSeeds(s + len - 32, y + k1, x, &w);
+ x = x * k1 + Fetch64(s);
+
+ /*
+ * Use len to count multiples of 64, and operate on 64-byte chunks.
+ */
+ for (len = (len - 1) >> 6; len != 0; len--) {
+ x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
+ y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
+ x ^= w.second;
+ y += v.first + Fetch64(s + 40);
+ z = Rotate(z + w.first, 33) * k1;
+ WeakHashLen32WithSeeds(s, v.second * k1, x + w.first, &v);
+ WeakHashLen32WithSeeds(
+ s + 32, z + w.second, y + Fetch64(s + 16), &w);
+ temp = z;
+ z = x;
+ x = temp;
+ s += 64;
+ }
+ return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
+ HashLen16(v.second, w.second) + x);
+}
+
+/*
+ * __wt_hash_city64 --
+ * WiredTiger wrapper around third party hash implementation.
+ */
+uint64_t
+__wt_hash_city64(const void *s, size_t len)
+{
+ return (CityHash64(s, len));
+}
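+
+/*
+ * Illustrative sketch, not part of the original source (key, key_len and
+ * nbuckets are assumed variables): a typical use is bucketing into a
+ * power-of-two sized hash table:
+ *
+ *	uint64_t h;
+ *
+ *	h = __wt_hash_city64(key, key_len);
+ *	bucket = h & (nbuckets - 1);
+ */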
diff --git a/src/third_party/wiredtiger/src/support/hash_fnv.c b/src/third_party/wiredtiger/src/support/hash_fnv.c
new file mode 100644
index 00000000000..68f8537a4a0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hash_fnv.c
@@ -0,0 +1,161 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
+ *
+ * @(#) $Revision: 5.1 $
+ * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
+ * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
+ *
+ ***
+ *
+ * Fowler/Noll/Vo hash
+ *
+ * The basis of this hash algorithm was taken from an idea sent
+ * as reviewer comments to the IEEE POSIX P1003.2 committee by:
+ *
+ * Phong Vo (http://www.research.att.com/info/kpv/)
+ * Glenn Fowler (http://www.research.att.com/~gsf/)
+ *
+ * In a subsequent ballot round:
+ *
+ * Landon Curt Noll (http://www.isthe.com/chongo/)
+ *
+ * improved on their algorithm. Some people tried this hash
+ * and found that it worked rather well. In an EMail message
+ * to Landon, they named it the ``Fowler/Noll/Vo'' or FNV hash.
+ *
+ * FNV hashes are designed to be fast while maintaining a low
+ * collision rate. The FNV speed allows one to quickly hash lots
+ * of data while maintaining a reasonable collision rate. See:
+ *
+ * http://www.isthe.com/chongo/tech/comp/fnv/index.html
+ *
+ * for more details as well as other forms of the FNV hash.
+ *
+ ***
+ *
+ * To use the recommended 64 bit FNV-1a hash, pass FNV1A_64_INIT as the
+ * uint64_t hashval argument to fnv_64a_buf() or fnv_64a_str().
+ *
+ ***
+ *
+ * Please do not copyright this code. This code is in the public domain.
+ *
+ * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
+ * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ *
+ * By:
+ * chongo <Landon Curt Noll> /\oo/\
+ * http://www.isthe.com/chongo/
+ *
+ * Share and Enjoy! :-)
+ */
+
+#include <stdlib.h>
+#include "wt_internal.h"
+
+/*
+ * This file contains an implementation of the FNV-1a 64-bit hash function.
+ * The implementation is from a third party.
+ *
+ * The code has been updated to remove unnecessary content and better comply
+ * with WiredTiger coding standards. The original source code can be found at:
+ * FNV 1a 64 bit: http://www.isthe.com/chongo/src/fnv/hash_64a.c
+ */
+
+/*
+ * 64 bit FNV-1 non-zero initial basis
+ *
+ * The FNV-1 initial basis is the FNV-0 hash of the following 32 octets:
+ *
+ * chongo <Landon Curt Noll> /\../\
+ *
+ * NOTE: The \'s above are not back-slashing escape characters.
+ * They are literal ASCII backslash 0x5c characters.
+ *
+ * NOTE: The FNV-1a initial basis is the same value as FNV-1 by definition.
+ */
+#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
+
+/*
+ * fnv_64a_buf --
+ * Perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
+ *
+ * input:
+ * buf - start of buffer to hash
+ * len - length of buffer in octets
+ * hval - previous hash value or 0 if first call
+ *
+ * returns:
+ * 64 bit hash as a static hash type
+ *
+ * NOTE: To use the recommended 64 bit FNV-1a hash, use FNV1A_64_INIT as the
+ * hval arg on the first call to either fnv_64a_buf() or fnv_64a_str().
+ */
+static inline uint64_t
+fnv_64a_buf(const void *buf, size_t len, uint64_t hval)
+{
+ const unsigned char *bp = buf; /* start of buffer */
+ const unsigned char *be = bp + len; /* beyond end of buffer */
+
+ /*
+ * FNV-1a hash each octet of the buffer
+ */
+ while (bp < be) {
+
+ /* xor the bottom with the current octet */
+ hval ^= (uint64_t)*bp++;
+
+ /*
+ * Multiply by the 64 bit FNV magic prime mod 2^64. The
+ * following shift operation is generally faster than
+ * a multiply operation.
+ */
+ hval += (hval << 1) + (hval << 4) + (hval << 5) +
+ (hval << 7) + (hval << 8) + (hval << 40);
+ }
+
+ /* return our new hash value */
+ return (hval);
+}
+
+/*
+ * __wt_hash_fnv64 --
+ * WiredTiger wrapper around third party hash implementation.
+ */
+uint64_t
+__wt_hash_fnv64(const void *string, size_t len)
+{
+ return (fnv_64a_buf(string, len, FNV1A_64_INIT));
+}
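+
+/*
+ * A worked note on the shift sequence above, not part of the original
+ * source: the 64-bit FNV prime is 0x100000001b3, and
+ *
+ *	2^40 + 2^8 + 2^7 + 2^5 + 2^4 + 2^1 + 2^0 = 0x100000001b3
+ *
+ * so "hval += (hval << 1) + ... + (hval << 40)" computes
+ * hval *= 0x100000001b3 (mod 2^64) without a multiply instruction.
+ */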
diff --git a/src/third_party/wiredtiger/src/support/hazard.c b/src/third_party/wiredtiger/src/support/hazard.c
new file mode 100644
index 00000000000..12350ab52f4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hazard.c
@@ -0,0 +1,244 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#ifdef HAVE_DIAGNOSTIC
+static void __hazard_dump(WT_SESSION_IMPL *);
+#endif
+
+/*
+ * __wt_hazard_set --
+ * Set a hazard pointer.
+ */
+int
+__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_BTREE *btree;
+ WT_HAZARD *hp;
+ int restarts = 0;
+
+ btree = S2BT(session);
+ *busyp = 0;
+
+ /* If a file can never be evicted, hazard pointers aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
+ return (0);
+
+ /*
+ * Do the dance:
+ *
+ * The memory location which makes a page "real" is the WT_REF's state
+ * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
+ * page eviction server.
+ *
+ * Add the WT_REF reference to the session's hazard list and flush the
+ * write, then see if the page's state is still valid. If so, we can
+ * use the page because the page eviction server will see our hazard
+ * pointer before it discards the page (the eviction server sets the
+ * state to WT_REF_LOCKED, then flushes memory and checks the hazard
+ * pointers).
+ *
+ * For sessions with many active hazard pointers, skip most of the
+ * active slots: there may be a free slot in there, but checking is
+ * expensive. Most hazard pointers are released quickly: optimize
+ * for that case.
+ */
+ for (hp = session->hazard + session->nhazard;; ++hp) {
+ /* Expand the number of hazard pointers if available. */
+ if (hp >= session->hazard + session->hazard_size) {
+ if (session->hazard_size >= S2C(session)->hazard_max)
+ break;
+ /* Restart the search. */
+ if (session->nhazard < session->hazard_size &&
+ restarts++ == 0) {
+ hp = session->hazard;
+ continue;
+ }
+ WT_PUBLISH(session->hazard_size,
+ WT_MIN(session->hazard_size + WT_HAZARD_INCR,
+ S2C(session)->hazard_max));
+ }
+
+ if (hp->page != NULL)
+ continue;
+
+ hp->page = ref->page;
+#ifdef HAVE_DIAGNOSTIC
+ hp->file = file;
+ hp->line = line;
+#endif
+ /* Publish the hazard pointer before reading page's state. */
+ WT_FULL_BARRIER();
+
+ /*
+ * Check if the page state is still valid, where valid means a
+ * state of WT_REF_MEM and the pointer is unchanged. (The
+ * pointer can change, it means the page was evicted between
+ * the time we set our hazard pointer and the publication. It
+ * would theoretically be possible for the page to be evicted
+ * and a different page read into the same memory, so the
+ * pointer hasn't changed but the contents have. That's OK, we
+ * found this page using the tree's key space, whatever page we
+ * find here is the page for us to use.)
+ */
+ if (ref->page == hp->page && ref->state == WT_REF_MEM) {
+ ++session->nhazard;
+ return (0);
+ }
+
+ /*
+ * The page isn't available, it's being considered for eviction
+ * (or being evicted, for all we know). If the eviction server
+ * sees our hazard pointer before evicting the page, it will
+ * return the page to use, no harm done, if it doesn't, it will
+ * go ahead and complete the eviction.
+ *
+ * We don't bother publishing this update: the worst case is we
+ * prevent some random page from being evicted.
+ */
+ hp->page = NULL;
+ *busyp = 1;
+ return (0);
+ }
+
+ __wt_errx(session, "session %p: hazard pointer table full", session);
+#ifdef HAVE_DIAGNOSTIC
+ __hazard_dump(session);
+#endif
+
+ return (ENOMEM);
+}
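+
+/*
+ * Illustrative call pattern, not part of the original source (real callers
+ * go through higher-level page-access functions, and diagnostic builds pass
+ * file/line through a macro): a reader pins a page before use and releases
+ * the pin afterward:
+ *
+ *	int busy;
+ *
+ *	WT_RET(__wt_hazard_set(session, ref, &busy));
+ *	if (busy)
+ *		retry the search;
+ *	read ref->page;
+ *	WT_RET(__wt_hazard_clear(session, ref->page));
+ */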
+
+/*
+ * __wt_hazard_clear --
+ * Clear a hazard pointer.
+ */
+int
+__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_HAZARD *hp;
+
+ btree = S2BT(session);
+
+ /* If a file can never be evicted, hazard pointers aren't required. */
+ if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
+ return (0);
+
+ /*
+ * Clear the caller's hazard pointer.
+ * The common pattern is LIFO, so do a reverse search.
+ */
+ for (hp = session->hazard + session->hazard_size - 1;
+ hp >= session->hazard;
+ --hp)
+ if (hp->page == page) {
+ /*
+ * We don't publish the hazard pointer clear in the
+ * general case. It's not required for correctness;
+ * it would give an eviction thread faster access to the
+ * page were the page selected for eviction. But the
+ * generation number was just set, so it's unlikely the
+ * page will be selected for eviction.
+ */
+ hp->page = NULL;
+
+ /*
+ * If this was the last hazard pointer in the session,
+ * we may need to update our transactional context.
+ */
+ --session->nhazard;
+ return (0);
+ }
+
+ /*
+ * A serious error, we should always find the hazard pointer. Panic,
+ * because using a page we didn't have pinned down implies corruption.
+ */
+ WT_PANIC_RET(session, EINVAL,
+ "session %p: clear hazard pointer: %p: not found", session, page);
+}
+
+/*
+ * __wt_hazard_close --
+ * Verify that no hazard pointers are set.
+ */
+void
+__wt_hazard_close(WT_SESSION_IMPL *session)
+{
+ WT_HAZARD *hp;
+ int found;
+
+ /*
+ * Check for a set hazard pointer and complain if we find one. We could
+ * just check the session's hazard pointer count, but this is a useful
+ * diagnostic.
+ */
+ for (found = 0, hp = session->hazard;
+ hp < session->hazard + session->hazard_size; ++hp)
+ if (hp->page != NULL) {
+ found = 1;
+ break;
+ }
+ if (session->nhazard == 0 && !found)
+ return;
+
+ __wt_errx(session,
+ "session %p: close hazard pointer table: table not empty", session);
+
+#ifdef HAVE_DIAGNOSTIC
+ __hazard_dump(session);
+#endif
+
+ /*
+ * Clear any hazard pointers because it's not a correctness problem
+ * (any hazard pointer we find can't be real because the session is
+ * being closed when we're called). We do this work because session
+ * close isn't so common an operation that the check is expensive, and
+ * we don't want to let a hazard pointer lie around, keeping a page
+ * from being evicted.
+ *
+ * We don't panic: this shouldn't be a correctness issue (at least, I
+ * can't think of a reason it would be).
+ */
+ for (hp = session->hazard;
+ hp < session->hazard + session->hazard_size; ++hp)
+ if (hp->page != NULL) {
+ hp->page = NULL;
+ --session->nhazard;
+ }
+
+ if (session->nhazard != 0)
+ __wt_errx(session,
+ "session %p: close hazard pointer table: count didn't "
+ "match entries",
+ session);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __hazard_dump --
+ * Display the list of hazard pointers.
+ */
+static void
+__hazard_dump(WT_SESSION_IMPL *session)
+{
+ WT_HAZARD *hp;
+
+ for (hp = session->hazard;
+ hp < session->hazard + session->hazard_size; ++hp)
+ if (hp->page != NULL)
+ __wt_errx(session,
+ "session %p: hazard pointer %p: %s, line %d",
+ session, hp->page, hp->file, hp->line);
+}
+#endif
diff --git a/src/third_party/wiredtiger/src/support/hex.c b/src/third_party/wiredtiger/src/support/hex.c
new file mode 100644
index 00000000000..9ee3e723fa2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/hex.c
@@ -0,0 +1,215 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+static const u_char hex[] = "0123456789abcdef";
+
+/*
+ * __fill_hex --
+ * In-memory conversion of raw bytes to a hexadecimal representation.
+ */
+static inline void
+__fill_hex(const uint8_t *src, size_t src_max,
+ uint8_t *dest, size_t dest_max, size_t *lenp)
+{
+ uint8_t *dest_orig;
+
+ dest_orig = dest;
+ if (dest_max > 0) /* save a byte for nul-termination */
+ --dest_max;
+ for (; src_max > 0 && dest_max > 1;
+ src_max -= 1, dest_max -= 2, ++src) {
+ *dest++ = hex[(*src & 0xf0) >> 4];
+ *dest++ = hex[*src & 0x0f];
+ }
+ *dest++ = '\0';
+ if (lenp != NULL)
+ *lenp = WT_PTRDIFF(dest, dest_orig);
+}
+
+/*
+ * __wt_raw_to_hex --
+ * Convert a chunk of data to a nul-terminated printable hex string.
+ */
+int
+__wt_raw_to_hex(
+ WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to)
+{
+ size_t len;
+
+ /*
+ * Every byte takes up 2 spaces, plus a trailing nul byte.
+ */
+ len = size * 2 + 1;
+ WT_RET(__wt_buf_init(session, to, len));
+
+ __fill_hex(from, size, to->mem, len, &to->size);
+ return (0);
+}
+
+/*
+ * __wt_raw_to_esc_hex --
+ * Convert a chunk of data to a nul-terminated printable string using
+ * escaped hex, as necessary.
+ */
+int
+__wt_raw_to_esc_hex(
+ WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to)
+{
+ size_t i;
+ const uint8_t *p;
+ u_char *t;
+
+ /*
+ * In the worst case, every character takes up 3 spaces, plus a
+ * trailing nul byte.
+ */
+ WT_RET(__wt_buf_init(session, to, size * 3 + 1));
+
+ for (p = from, t = to->mem, i = size; i > 0; --i, ++p)
+ if (isprint((int)*p)) {
+ if (*p == '\\')
+ *t++ = '\\';
+ *t++ = *p;
+ } else {
+ *t++ = '\\';
+ *t++ = hex[(*p & 0xf0) >> 4];
+ *t++ = hex[*p & 0x0f];
+ }
+ *t++ = '\0';
+ to->size = WT_PTRDIFF(t, to->mem);
+ return (0);
+}
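+
+/*
+ * Worked example, not part of the original source: the three input bytes
+ * 'a', 0x01 and backslash encode as six characters -- 'a' passes through
+ * unchanged, 0x01 becomes backslash, '0', '1', and the backslash itself is
+ * doubled. __wt_esc_hex_to_raw below reverses the transformation.
+ */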
+
+/*
+ * __wt_hex2byte --
+ * Convert a pair of hex characters into a byte.
+ */
+int
+__wt_hex2byte(const u_char *from, u_char *to)
+{
+ uint8_t byte;
+
+ switch (from[0]) {
+ case '0': byte = 0; break;
+ case '1': byte = 1 << 4; break;
+ case '2': byte = 2 << 4; break;
+ case '3': byte = 3 << 4; break;
+ case '4': byte = 4 << 4; break;
+ case '5': byte = 5 << 4; break;
+ case '6': byte = 6 << 4; break;
+ case '7': byte = 7 << 4; break;
+ case '8': byte = 8 << 4; break;
+ case '9': byte = 9 << 4; break;
+ case 'a': byte = 10 << 4; break;
+ case 'b': byte = 11 << 4; break;
+ case 'c': byte = 12 << 4; break;
+ case 'd': byte = 13 << 4; break;
+ case 'e': byte = 14 << 4; break;
+ case 'f': byte = 15 << 4; break;
+ default:
+ return (1);
+ }
+
+ switch (from[1]) {
+ case '0': break;
+ case '1': byte |= 1; break;
+ case '2': byte |= 2; break;
+ case '3': byte |= 3; break;
+ case '4': byte |= 4; break;
+ case '5': byte |= 5; break;
+ case '6': byte |= 6; break;
+ case '7': byte |= 7; break;
+ case '8': byte |= 8; break;
+ case '9': byte |= 9; break;
+ case 'a': byte |= 10; break;
+ case 'b': byte |= 11; break;
+ case 'c': byte |= 12; break;
+ case 'd': byte |= 13; break;
+ case 'e': byte |= 14; break;
+ case 'f': byte |= 15; break;
+ default:
+ return (1);
+ }
+ *to = byte;
+ return (0);
+}
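+
+/*
+ * Illustrative sketch, not part of the original source: the two characters
+ * "3f" convert to the byte 0x3f ('3' supplies the high nibble, 'f' the
+ * low); only lower-case hex digits are accepted:
+ *
+ *	u_char byte;
+ *
+ *	if (__wt_hex2byte((const u_char *)"3f", &byte) != 0)
+ *		handle the bad digit;
+ */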
+
+/*
+ * __hex_fmterr --
+ * Hex format error message.
+ */
+static int
+__hex_fmterr(WT_SESSION_IMPL *session)
+{
+ WT_RET_MSG(session, EINVAL, "Invalid format in hexadecimal string");
+}
+
+/*
+ * __wt_hex_to_raw --
+ * Convert a nul-terminated printable hex string to a chunk of data.
+ */
+int
+__wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
+{
+ return (__wt_nhex_to_raw(session, from, strlen(from), to));
+}
+
+/*
+ * __wt_nhex_to_raw --
+ * Convert a printable hex string to a chunk of data.
+ */
+int
+__wt_nhex_to_raw(
+ WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to)
+{
+ const u_char *p;
+ u_char *t;
+
+ if (size % 2 != 0)
+ return (__hex_fmterr(session));
+
+ WT_RET(__wt_buf_init(session, to, size / 2));
+
+ for (p = (u_char *)from, t = to->mem; size > 0; p += 2, size -= 2, ++t)
+ if (__wt_hex2byte(p, t))
+ return (__hex_fmterr(session));
+
+ to->size = WT_PTRDIFF(t, to->mem);
+ return (0);
+}
+
+/*
+ * __wt_esc_hex_to_raw --
+ * Convert a printable string, encoded in escaped hex, to a chunk of data.
+ */
+int
+__wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to)
+{
+ const u_char *p;
+ u_char *t;
+
+ WT_RET(__wt_buf_init(session, to, strlen(from)));
+
+ for (p = (u_char *)from, t = to->mem; *p != '\0'; ++p, ++t) {
+ if ((*t = *p) != '\\')
+ continue;
+ ++p;
+ if (p[0] != '\\') {
+ if (p[0] == '\0' || p[1] == '\0' || __wt_hex2byte(p, t))
+ return (__hex_fmterr(session));
+ ++p;
+ }
+ }
+ to->size = WT_PTRDIFF(t, to->mem);
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/support/huffman.c b/src/third_party/wiredtiger/src/support/huffman.c
new file mode 100644
index 00000000000..5a06b72d33e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/huffman.c
@@ -0,0 +1,899 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#define __HUFFMAN_DETAIL 0 /* Set to 1 for debugging output. */
+
+/* Length of header in compressed message, in bits. */
+#define WT_HUFFMAN_HEADER 3
+
+/*
+ * Maximum allowed length of Huffman code words, which otherwise can range up
+ * to (#symbols - 1) bits long. Use a lower value for less table memory,
+ * a higher value for better compression. Max value = 16 (or 32-7=25 or
+ * 64-7=57 if the data types are adjusted). FYI, JPEG uses 16. A side
+ * effect of limiting the max code length is that the worst-case
+ * compression (a message of the least frequent symbols) is shorter.
+ */
+#define MAX_CODE_LENGTH 16
+
+typedef struct __wt_freqtree_node {
+ /*
+ * Data structure representing a node of the huffman tree. It holds a
+ * 64-bit weight and pointers to the left and right child nodes. The
+ * node either has two child nodes or none.
+ */
+ uint8_t symbol; /* only used in leaf nodes */
+ uint64_t weight;
+ struct __wt_freqtree_node *left; /* bit 0 */
+ struct __wt_freqtree_node *right; /* bit 1 */
+} WT_FREQTREE_NODE;
+
+typedef struct __wt_huffman_code {
+ uint16_t pattern; /* requirement: length of field's type
+ * in bits >= MAX_CODE_LENGTH.
+ */
+ uint8_t length;
+} WT_HUFFMAN_CODE;
+
+typedef struct __wt_huffman_obj {
+ /*
+ * Data structure here defines specific instance of the encoder/decoder.
+ */
+ u_int numSymbols; /* Symbols: UINT16_MAX or UINT8_MAX */
+
+ uint16_t max_depth, min_depth; /* Tree max/min depths */
+
+ /*
+ * use: codes[symbol] = struct with pattern and length.
+ * Used in encoding and decoding.
+ * memory: codes[0-to-(number of symbols - 1)]
+ */
+ WT_HUFFMAN_CODE *codes;
+
+ /*
+ * use: code2symbol[Huffman_code] = symbol.
+ * Used in decoding.
+ * memory: code2symbol[1 << max_code_length]
+ */
+ uint8_t *code2symbol;
+} WT_HUFFMAN_OBJ;
+
+/*
+ * Queue element data structure.
+ *
+ * Consists of a pointer to a huffman tree node, and a pointer to the next
+ * element in the queue.
+ */
+typedef struct node_queue_elem {
+ WT_FREQTREE_NODE *node;
+ struct node_queue_elem *next;
+} NODE_QUEUE_ELEM;
+
+/*
+ * Queue of huffman tree nodes.
+ *
+ * Contains a pointer to the beginning and the end of the queue, which is
+ * implemented as a linked list.
+ */
+typedef struct node_queue {
+ NODE_QUEUE_ELEM *first;
+ NODE_QUEUE_ELEM *last;
+} NODE_QUEUE;
+
+/*
+ * Internal data structure used to preserve the symbol when rearranging the
+ * frequency array.
+ */
+typedef struct __indexed_byte {
+ uint32_t symbol; /* not uint8_t: match external data structure */
+ uint32_t frequency;
+} INDEXED_SYMBOL;
+
+static int indexed_freq_compare(const void *, const void *);
+static int indexed_symbol_compare(const void *, const void *);
+static void make_table(
+ WT_SESSION_IMPL *, uint8_t *, uint16_t, WT_HUFFMAN_CODE *, u_int);
+static void node_queue_close(WT_SESSION_IMPL *, NODE_QUEUE *);
+static void node_queue_dequeue(
+ WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE **);
+static int node_queue_enqueue(
+ WT_SESSION_IMPL *, NODE_QUEUE *, WT_FREQTREE_NODE *);
+static uint32_t profile_tree(
+ WT_FREQTREE_NODE *, uint16_t, uint16_t *, uint16_t *);
+static void recursive_free_node(WT_SESSION_IMPL *, WT_FREQTREE_NODE *);
+static void set_codes(WT_FREQTREE_NODE *, WT_HUFFMAN_CODE *, uint16_t, uint8_t);
+
+#define node_queue_is_empty(queue) \
+ ((queue) == NULL || (queue)->first == NULL)
+
+/*
+ * indexed_symbol_compare --
+ * Qsort comparator to order the table by symbol, lowest to highest.
+ */
+static int
+indexed_symbol_compare(const void *a, const void *b)
+{
+ return (((INDEXED_SYMBOL *)a)->symbol >
+ ((INDEXED_SYMBOL *)b)->symbol ? 1 :
+ (((INDEXED_SYMBOL *)a)->symbol <
+ ((INDEXED_SYMBOL *)b)->symbol ? -1 : 0));
+}
+
+/*
+ * indexed_freq_compare --
+ * Qsort comparator to order the table by frequency (the most frequent
+ * symbols will be at the end of the array).
+ */
+static int
+indexed_freq_compare(const void *a, const void *b)
+{
+ return (((INDEXED_SYMBOL *)a)->frequency >
+ ((INDEXED_SYMBOL *)b)->frequency ? 1 :
+ (((INDEXED_SYMBOL *)a)->frequency <
+ ((INDEXED_SYMBOL *)b)->frequency ? -1 : 0));
+}
+
+/*
+ * profile_tree --
+ * Traverses tree to determine #leaves under each node, max depth, min
+ * depth of leaf.
+ */
+static uint32_t
+profile_tree(WT_FREQTREE_NODE *node,
+ uint16_t len, uint16_t *max_depth, uint16_t *min_depth)
+{
+ uint32_t leaf_cnt;
+
+ if (node->left == NULL && node->right == NULL) { /* leaf */
+ leaf_cnt = 1;
+ if (*max_depth < len)
+ *max_depth = len;
+ if (*min_depth > len)
+ *min_depth = len;
+ } else {
+ /*
+ * internal node -- the way the tree is constructed, an
+ * internal node always has both left and right children
+ */
+ leaf_cnt =
+ profile_tree(node->left, len + 1, max_depth, min_depth) +
+ profile_tree(node->right, len + 1, max_depth, min_depth);
+ }
+ node->weight = leaf_cnt; /* abuse weight field */
+ return (leaf_cnt);
+}
+
+/*
+ * set_codes --
+ * Computes Huffman code for each symbol in tree.
+ *
+ * Method is standard way in the literature, except that limits maximum code
+ * length. A known max code length is important for limiting memory use by
+ * the tables and for knowing how large data types need to be such as the field
+ * that holds the code pattern.
+ */
+static void
+set_codes(WT_FREQTREE_NODE *node,
+ WT_HUFFMAN_CODE *codes, uint16_t pattern, uint8_t len)
+{
+ WT_HUFFMAN_CODE *code;
+ uint16_t patternleft, patternright, half;
+ uint8_t remaining;
+
+ if (node->left == NULL && node->right == NULL) {
+ code = &codes[node->symbol];
+ code->pattern = pattern;
+ code->length = len;
+#if __HUFFMAN_DETAIL
+ printf("%" PRIx16 ": code %" PRIx16 ", len %" PRIu8 "\n",
+ node->symbol, pattern, len);
+#endif
+ } else {
+ /*
+ * Check each subtree individually to see if we can afford to
+ * split up bits into possibly shorter codes, or if we need to
+ * employ all remaining bits up to MAX_CODE_LENGTH to
+ * consecutively number the leaves.
+ */
+ remaining = MAX_CODE_LENGTH - len;
+ /*
+ * If not already in "low-bit mode", but need to be, open up
+ * lower-order bits for consecutive numbering.
+ */
+ if (len < MAX_CODE_LENGTH &&
+ ((half = 1 << (remaining - 1)) < node->left->weight ||
+ half < node->right->weight)) {
+ pattern = pattern << remaining;
+ len = MAX_CODE_LENGTH;
+ }
+
+ if (len < MAX_CODE_LENGTH) {
+ patternleft = (pattern << 1) | 0;
+ patternright = (pattern << 1) | 1;
+ len++;
+ } else { /* "low bit mode" */
+ patternleft = pattern;
+ patternright = pattern + node->left->weight;
+ /* len unchanged */
+ }
+
+ set_codes(node->left, codes, patternleft, len);
+ set_codes(node->right, codes, patternright, len);
+ }
+}
+
+/*
+ * make_table --
+ * Computes Huffman table used for subsequent lookups in encoding and
+ * decoding. With the table, encoding from a symbol to Huffman code and
+ * decoding from a code to a symbol are simple array lookups.
+ */
+static void
+make_table(WT_SESSION_IMPL *session, uint8_t *code2symbol,
+ uint16_t max_depth, WT_HUFFMAN_CODE *codes, u_int symcnt)
+{
+ uint32_t j, c1, c2; /* Exceeds uint16_t bounds at loop boundary. */
+ uint16_t c, i;
+ uint8_t len, shift;
+
+ /* Zero out, for assertion below. */
+ for (j = 0, c2 = (1U << max_depth); j < c2; j++)
+ code2symbol[j] = 0;
+
+ /*
+ * Here's the magic: flood all bit patterns for lower-order bits to
+ * point to the same symbol.
+ */
+ for (i = 0; i < symcnt; i++) {
+ if ((len = codes[i].length) == 0)
+ continue;
+
+ /*
+ * The size of the array index should be enough to hold the
+ * largest index into the symbol table. Pre-existing symbols
+ * were packed 0-255, so 8 bits is enough. We don't want to
+ * make it larger than necessary; we allocate
+ * (2 ^ max-code-length) of them.
+ */
+ c = codes[i].pattern;
+ shift = max_depth - len;
+ c1 = (uint32_t)c << shift;
+ c2 = (uint32_t)(c + 1) << shift;
+ for (j = c1; j < c2; j++) {
+ WT_ASSERT(session, code2symbol[j] == 0);
+ code2symbol[j] = i;
+ }
+ }
+}
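+
+/*
+ * A worked flooding example, not part of the original source: with a
+ * max_depth of 4, a symbol whose code is binary 10 (length 2) has
+ * shift = 2, c1 = 2 << 2 = 8 and c2 = 3 << 2 = 12, so it fills slots 8
+ * through 11 (binary 1000 through 1011) -- any 4-bit lookup whose prefix
+ * is 10 decodes to that symbol.
+ */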
+
+/*
+ * recursive_free_node --
+ * Recursively free the huffman frequency tree's nodes.
+ */
+static void
+recursive_free_node(WT_SESSION_IMPL *session, WT_FREQTREE_NODE *node)
+{
+ if (node != NULL) {
+ recursive_free_node(session, node->left);
+ recursive_free_node(session, node->right);
+ __wt_free(session, node);
+ }
+}
+
+/*
+ * __wt_huffman_open --
+ * Take a frequency table and return a pointer to a descriptor object.
+ */
+int
+__wt_huffman_open(WT_SESSION_IMPL *session,
+ void *symbol_frequency_array, u_int symcnt, u_int numbytes, void *retp)
+{
+ INDEXED_SYMBOL *indexed_freqs, *sym;
+ NODE_QUEUE *combined_nodes, *leaves;
+ WT_DECL_RET;
+ WT_FREQTREE_NODE *node, *node2, **refnode, *tempnode;
+ WT_HUFFMAN_OBJ *huffman;
+ uint64_t w1, w2;
+ uint16_t i;
+
+ indexed_freqs = symbol_frequency_array;
+
+ combined_nodes = leaves = NULL;
+ node = node2 = tempnode = NULL;
+
+ WT_RET(__wt_calloc_def(session, 1, &huffman));
+
+ /*
+ * The frequency table is 4B pairs of symbol and frequency. The symbol
+ * is either 1 or 2 bytes and the frequency ranges from 1 to UINT32_MAX
+ * (a frequency of 0 means the value is never expected to appear in the
+ * input). Validate the symbols are within range.
+ */
+ if (numbytes != 1 && numbytes != 2)
+ WT_ERR_MSG(session, EINVAL,
+ "illegal number of symbol bytes specified for a huffman "
+ "table");
+
+ if (symcnt == 0)
+ WT_ERR_MSG(session, EINVAL,
+ "illegal number of symbols specified for a huffman table");
+
+ huffman->numSymbols = numbytes == 2 ? UINT16_MAX : UINT8_MAX;
+
+ /*
+ * Order the array by symbol and check for invalid symbols and
+ * duplicates.
+ */
+ qsort((void *)indexed_freqs,
+ symcnt, sizeof(INDEXED_SYMBOL), indexed_symbol_compare);
+ for (i = 0; i < symcnt; ++i) {
+ if (i > 0 &&
+ indexed_freqs[i].symbol == indexed_freqs[i - 1].symbol)
+ WT_ERR_MSG(session, EINVAL,
+ "duplicate symbol %" PRIx32
+ " specified in a huffman table",
+ indexed_freqs[i].symbol);
+ if (indexed_freqs[i].symbol > huffman->numSymbols)
+ WT_ERR_MSG(session, EINVAL,
+ "illegal symbol %" PRIx32
+ " specified in a huffman table",
+ indexed_freqs[i].symbol);
+ }
+
+ /*
+ * Massage frequencies.
+ */
+ indexed_freqs = NULL;
+ WT_ERR(__wt_calloc_def(session, 256, &indexed_freqs));
+
+ /*
+ * Minimum of frequency==1 so everybody gets a Huffman code, in case
+ * data evolves and we need to represent this value.
+ */
+ for (i = 0; i < 256; i++) {
+ sym = &indexed_freqs[i];
+ sym->symbol = i;
+ sym->frequency = 1;
+ }
+ /*
+ * Avoid large tables by splitting UTF-16 frequencies into high byte
+ * and low byte.
+ */
+ for (i = 0; i < symcnt; i++) {
+ sym = &((INDEXED_SYMBOL *)symbol_frequency_array)[i];
+ indexed_freqs[sym->symbol & 0xff].frequency += sym->frequency;
+ if (numbytes == 2)
+ indexed_freqs[(sym->symbol >> 8) & 0xff].frequency +=
+ sym->frequency;
+ }
+ huffman->numSymbols = symcnt = 256;
+
+ /*
+ * The array must be sorted by frequency to be able to use a linear time
+ * construction algorithm.
+ */
+ qsort((void *)indexed_freqs,
+ symcnt, sizeof(INDEXED_SYMBOL), indexed_freq_compare);
+
+ /* We need two node queues to build the tree. */
+ WT_ERR(__wt_calloc_def(session, 1, &leaves));
+ WT_ERR(__wt_calloc_def(session, 1, &combined_nodes));
+
+ /*
+ * Adding the leaves to the queue.
+ *
+ * Discard symbols with a frequency of 0; this assumes these symbols
+ * never occur in the source stream, and the purpose is to reduce the
+ * huffman tree's size.
+ */
+ for (i = 0; i < symcnt; ++i)
+ if (indexed_freqs[i].frequency > 0) {
+ WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+ tempnode->symbol = (uint8_t)indexed_freqs[i].symbol;
+ tempnode->weight = indexed_freqs[i].frequency;
+ WT_ERR(node_queue_enqueue(session, leaves, tempnode));
+ tempnode = NULL;
+ }
+
+ while (!node_queue_is_empty(leaves) ||
+ !node_queue_is_empty(combined_nodes)) {
+ /*
+ * We have to get the node with the smaller weight, examining
+ * both queues' first element. We are collecting pairs of these
+ * items by alternating between node and node2:
+ */
+ refnode = !node ? &node : &node2;
+
+ /*
+ * To decide which queue must be used, we get the weights of
+ * the first items from both:
+ */
+ w1 = node_queue_is_empty(leaves) ?
+ UINT64_MAX : leaves->first->node->weight;
+ w2 = node_queue_is_empty(combined_nodes) ?
+ UINT64_MAX : combined_nodes->first->node->weight;
+
+ /*
+ * Based on the two weights we finally can dequeue the smaller
+ * element and place it to the alternating target node pointer:
+ */
+ if (w1 < w2)
+ node_queue_dequeue(session, leaves, refnode);
+ else
+ node_queue_dequeue(session, combined_nodes, refnode);
+
+ /*
+ * In every second run, we have both node and node2 initialized.
+ */
+ if (node != NULL && node2 != NULL) {
+ WT_ERR(__wt_calloc_def(session, 1, &tempnode));
+
+ /* The new weight is the sum of the two weights. */
+ tempnode->weight = node->weight + node2->weight;
+ tempnode->left = node;
+ tempnode->right = node2;
+
+ /* Enqueue it to the combined nodes queue */
+ WT_ERR(node_queue_enqueue(
+ session, combined_nodes, tempnode));
+ tempnode = NULL;
+
+ /* Reset the state pointers */
+ node = node2 = NULL;
+ }
+ }
+
+ /*
+ * The remaining node, in the node variable, is the root of the tree.
+ * Walk the tree to compute the maximum and minimum leaf depths, that
+ * is, the longest and shortest code lengths.
+ */
+ huffman->max_depth = 0;
+ huffman->min_depth = MAX_CODE_LENGTH;
+ (void)profile_tree(node, 0, &huffman->max_depth, &huffman->min_depth);
+ if (huffman->max_depth > MAX_CODE_LENGTH)
+ huffman->max_depth = MAX_CODE_LENGTH;
+
+ WT_ERR(__wt_calloc_def(session, huffman->numSymbols, &huffman->codes));
+ set_codes(node, huffman->codes, 0, 0);
+
+ WT_ERR(__wt_calloc_def(
+ session, 1U << huffman->max_depth, &huffman->code2symbol));
+ make_table(session, huffman->code2symbol,
+ huffman->max_depth, huffman->codes, huffman->numSymbols);
+
+#if __HUFFMAN_DETAIL
+ {
+ uint8_t symbol;
+ uint32_t weighted_length;
+
+ printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: "
+ "codes %u# * %uB + code2symbol %u# * %uB\n",
+ huffman->min_depth, huffman->max_depth,
+ huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE),
+ 1U << huffman->max_depth, (u_int)sizeof(uint8_t));
+
+ /*
+ * measure quality of computed Huffman codes, for different max bit
+ * lengths (say, 16 vs 24 vs 32)
+ */
+ weighted_length = 0;
+ for (i = 0; i < symcnt; i++) {
+ symbol = indexed_freqs[i].symbol;
+ weighted_length +=
+ indexed_freqs[i].frequency * huffman->codes[symbol].length;
+ printf(
+ "\t%" PRIu16 "->%" PRIu16 ". %" PRIu32 " * %" PRIu8 "\n",
+ i, symbol,
+ indexed_freqs[i].frequency, huffman->codes[symbol].length);
+ }
+ printf("weighted length of all codes (the smaller the better): "
+ "%" PRIu32 "\n", weighted_length);
+ }
+#endif
+
+ *(void **)retp = huffman;
+
+ if (0) {
+err: if (ret == 0)
+ ret = WT_ERROR;
+ }
+ __wt_free(session, indexed_freqs);
+ if (leaves != NULL)
+ node_queue_close(session, leaves);
+ if (combined_nodes != NULL)
+ node_queue_close(session, combined_nodes);
+ if (node != NULL)
+ recursive_free_node(session, node);
+ if (node2 != NULL)
+ recursive_free_node(session, node2);
+ __wt_free(session, tempnode);
+ if (ret != 0)
+ __wt_huffman_close(session, huffman);
+ return (ret);
+}
+
+/*
+ * __wt_huffman_close --
+ * Discard a Huffman descriptor object.
+ */
+void
+__wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg)
+{
+ WT_HUFFMAN_OBJ *huffman;
+
+ huffman = huffman_arg;
+
+ __wt_free(session, huffman->code2symbol);
+ __wt_free(session, huffman->codes);
+ __wt_free(session, huffman);
+}
+
+#if __HUFFMAN_DETAIL
+/*
+ * __wt_print_huffman_code --
+ * Prints a symbol's Huffman code.
+ */
+int
+__wt_print_huffman_code(void *huffman_arg, uint16_t symbol)
+{
+ WT_HUFFMAN_CODE code;
+ WT_HUFFMAN_OBJ *huffman;
+
+ huffman = huffman_arg;
+
+ if (symbol >= huffman->numSymbols)
+ printf("symbol %" PRIu16 " out of range\n", symbol);
+ else {
+ code = huffman->codes[symbol];
+ if (code.length == 0)
+ printf(
+ "symbol %" PRIu16 " not defined -- 0 frequency\n",
+ symbol);
+ else
+ /* should print code as binary */
+ printf(
+ "%" PRIu16 " -> code pattern "
+ "%" PRIx16 ", length %" PRIu8 "\n",
+ symbol, code.pattern, code.length);
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * __wt_huffman_encode --
+ * Take a byte string, encode it into the target.
+ *
+ * Translation from symbol to Huffman code is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'codes' with one WT_HUFFMAN_CODE per
+ * symbol. Then, given a symbol:
+ * pattern = codes[symbol].pattern;
+ * length = codes[symbol].length;
+ *
+ * To encode a byte string, we iterate over the input symbols. For each
+ * symbol, look it up via the table, shift its bits onto a shift register (an
+ * int long enough to hold the longest code word plus up to 7 bits remaining
+ * from the previous symbol), then drain out full bytes. Finally, at the end,
+ * flush the remaining bits and write the header bits.
+ */
+int
+__wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg,
+ const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+ WT_DECL_RET;
+ WT_HUFFMAN_CODE code;
+ WT_HUFFMAN_OBJ *huffman;
+ WT_ITEM *tmp;
+ size_t max_len, outlen, bytes;
+ uint64_t bitpos;
+ const uint8_t *from;
+ uint8_t len, *out, padding_info, symbol;
+
+ /*
+ * Shift register to accumulate bits from input.
+ * Should be >= (MAX_CODE_LENGTH + 7), but also efficient to shift bits
+ * and preferably in a machine register.
+ */
+ uint32_t bits;
+
+ /* Count of bits in shift register ('bits' above). */
+ uint8_t valid;
+
+ huffman = huffman_arg;
+ from = from_arg;
+ tmp = NULL;
+
+ /*
+ * We don't want to find all of our callers and ensure they don't pass
+ * 0-length byte strings, but there's no reason to do any work.
+ */
+ if (from_len == 0) {
+ to_buf->size = 0;
+ return (0);
+ }
+
+ /*
+ * Compute the largest compressed output size, which is if all symbols
+ * are least frequent and so have largest Huffman codes, and compressed
+ * output may be larger than the input size. This way we don't have to
+ * worry about resizing the buffer during compression. Use the shared
+ * system buffer while compressing, then allocate a new buffer of the
+ * right size and copy the result into it.
+ */
+ max_len = (WT_HUFFMAN_HEADER +
+ from_len * huffman->max_depth + 7 /* round up to full byte */) / 8;
+ WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+
+ /*
+ * Leave the first 3 bits of the encoded value empty, it holds the
+ * number of bits actually used in the last byte of the encoded value.
+ */
+ bits = 0;
+ bitpos = WT_HUFFMAN_HEADER;
+ valid = WT_HUFFMAN_HEADER;
+ out = tmp->mem;
+ for (bytes = 0; bytes < from_len; bytes++) {
+ WT_ASSERT(session, WT_PTR_IN_RANGE(from, from_arg, from_len));
+
+ symbol = *from++;
+
+ /* Translate symbol into Huffman code and stuff into buffer. */
+ code = huffman->codes[symbol];
+ len = code.length;
+ bits = (bits << len) | code.pattern;
+ valid += len;
+ bitpos += len;
+ while (valid >= 8) {
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+ *out++ = (uint8_t)(bits >> (valid - 8));
+ valid -= 8;
+ }
+ }
+ if (valid > 0) { /* Flush shift register. */
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(out, tmp->mem, tmp->memsize));
+ *out = (uint8_t)(bits << (8 - valid));
+ }
+
+ /*
+ * At this point, bitpos is the total number of used bits (including
+ * the 3 bits at the beginning of the buffer, which we'll set now to
+ * the number of bits used in the last byte). Note if the number of
+ * bits used in the last byte is 8, we set the 3 bits to 0, in other
+ * words, the first 3 bits of the encoded value are the number of bits
+ * used in the last byte, unless they're 0, in which case there are 8
+ * bits used in the last byte.
+ */
+ padding_info = (bitpos % 8) << (8 - WT_HUFFMAN_HEADER);
+ ((uint8_t *)tmp->mem)[0] |= padding_info;
+
+ /* Copy result of exact known size into caller's buffer. */
+ outlen = (uint32_t)((bitpos + 7) / 8);
+ WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+ memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+ printf("encode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+ max_len, outlen);
+#endif
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
+
+/*
+ * __wt_huffman_decode --
+ * Take a byte string, decode it into the target.
+ *
+ * Translation from Huffman code to symbol is a simple array lookup.
+ *
+ * WT_HUFFMAN_OBJ contains an array called 'code2symbol' indexed by code word
+ * and whose value is the corresponding symbol.
+ * From the symbol, we index into the 'codes' array to get the code length.
+ *
+ * When decoding a message, we don't know where the boundaries are between
+ * codes. The trick is that we collect enough bits for the longest code word,
+ * and construct the table such that for codes with fewer bits we flood the
+ * table with all of the bit patterns in the lower order bits. This works
+ * because the Huffman code is a unique prefix, and by the flooding we are
+ * treating bits beyond the unique prefix as don't care bits.
+ *
+ * For example, we have a table of length 2^max_code_length
+ * (1 << max_code_length). For a code of length max_code_length, the
+ * position code2symbol[code] = symbol.
+ * For a code word of (max_length - 1), we fill code2symbol[code << 1] = symbol,
+ * as well as code2symbol[(code << 1) | 1] = symbol.
+ * And so on, so in general we fill:
+ * code2symbol[(code) << shift inclusive .. (code+1) << shift exclusive].
+ *
+ * To decode a message, we read in enough bits from input to fill the shift
+ * register with at least MAX_CODE_LENGTH bits.
+ * We look up the code in the code2symbol table to obtain the symbol.
+ * We look up the symbol in 'codes' to obtain the code length.
+ * Finally, we subtract these bits off the shift register.
+ */
+int
+__wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg,
+ const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf)
+{
+ WT_DECL_RET;
+ WT_ITEM *tmp;
+ WT_HUFFMAN_OBJ *huffman;
+ size_t from_bytes, len, max_len, outlen;
+ uint64_t from_len_bits;
+ uint32_t bits, mask, max;
+ uint16_t pattern;
+ const uint8_t *from;
+ uint8_t padding_info, symbol, *to, valid;
+
+ huffman = huffman_arg;
+ from = from_arg;
+ tmp = NULL;
+
+ /*
+ * We don't want to find all of our callers and ensure they don't pass
+ * 0-length byte strings, but there's no reason to do any work.
+ */
+ if (from_len == 0) {
+ to_buf->size = 0;
+ return (0);
+ }
+
+ /*
+ * The first 3 bits are the number of used bits in the last byte, unless
+ * they're 0, in which case there are 8 bits used in the last byte.
+ */
+ padding_info = (*from & 0xE0) >> (8 - WT_HUFFMAN_HEADER);
+ from_len_bits = from_len * 8;
+ if (padding_info != 0)
+ from_len_bits -= 8U - padding_info;
+
+ /* Number of bits that have codes. */
+ from_len_bits -= WT_HUFFMAN_HEADER;
+
+ /*
+ * Compute largest uncompressed output size, which is if all symbols are
+ * most frequent and so have smallest Huffman codes and therefore
+ * largest expansion. Use the shared system buffer while uncompressing,
+ * then allocate a new buffer of exactly the right size and copy the
+ * result into it.
+ */
+ max_len = (uint32_t)(from_len_bits / huffman->min_depth);
+ WT_ERR(__wt_scr_alloc(session, max_len, &tmp));
+ to = tmp->mem;
+
+ /* The first byte of input is a special case because of header bits. */
+ bits = *from++;
+ valid = 8 - WT_HUFFMAN_HEADER;
+ from_bytes = from_len - 1;
+
+ max = huffman->max_depth;
+ mask = (1U << max) - 1;
+ for (outlen = 0; from_len_bits > 0; outlen++) {
+ while (valid < max && from_bytes > 0) {
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(from, from_arg, from_len));
+ bits = (bits << 8) | *from++;
+ valid += 8;
+ from_bytes--;
+ }
+ pattern = valid >= max ? /* short patterns near end */
+ (bits >> (valid - max)) : (bits << (max - valid));
+ symbol = huffman->code2symbol[pattern & mask];
+ len = huffman->codes[symbol].length;
+ valid -= len;
+ WT_ASSERT(session, from_len_bits >= len);
+ from_len_bits -= len;
+
+ WT_ASSERT(session,
+ WT_PTR_IN_RANGE(to, tmp->mem, tmp->memsize));
+ *to++ = symbol;
+ }
+
+ /* Return the number of bytes used. */
+ WT_ERR(__wt_buf_initsize(session, to_buf, outlen));
+ memcpy(to_buf->mem, tmp->mem, outlen);
+
+#if __HUFFMAN_DETAIL
+ printf("decode: worst case %" PRIu32 " bytes -> actual %" PRIu32 "\n",
+ max_len, outlen);
+#endif
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
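+
+/*
+ * Illustrative round trip, not part of the original source (huffman is a
+ * descriptor previously returned by __wt_huffman_open):
+ *
+ *	WT_ITEM enc, dec;
+ *
+ *	WT_CLEAR(enc);
+ *	WT_CLEAR(dec);
+ *	WT_RET(__wt_huffman_encode(session, huffman, src, src_len, &enc));
+ *	WT_RET(__wt_huffman_decode(session, huffman, enc.mem, enc.size, &dec));
+ *
+ * dec now holds src_len bytes equal to src; both buffers are eventually
+ * released with __wt_buf_free.
+ */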
+
+/*
+ * node_queue_close --
+ * Delete a queue from memory.
+ *
+ * It does not free the huffman tree nodes the elements point to!
+ */
+static void
+node_queue_close(WT_SESSION_IMPL *session, NODE_QUEUE *queue)
+{
+ NODE_QUEUE_ELEM *elem, *next_elem;
+
+ /* Freeing each element of the queue's linked list. */
+ for (elem = queue->first; elem != NULL; elem = next_elem) {
+ next_elem = elem->next;
+ __wt_free(session, elem);
+ }
+
+ /* Freeing the queue record itself. */
+ __wt_free(session, queue);
+}
+
+/*
+ * node_queue_enqueue --
+ * Push a tree node to the end of the queue.
+ */
+static int
+node_queue_enqueue(
+ WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE *node)
+{
+ NODE_QUEUE_ELEM *elem;
+
+ /* Allocating a new linked list element */
+ WT_RET(__wt_calloc_def(session, 1, &elem));
+
+ /* It holds the tree node, and has no next element yet */
+ elem->node = node;
+ elem->next = NULL;
+
+ /* If the queue is empty, the first element will be the new one. */
+ if (queue->first == NULL)
+ queue->first = elem;
+
+ /*
+ * If the queue is not empty, the last element's next pointer must be
+ * updated.
+ */
+ if (queue->last != NULL)
+ queue->last->next = elem;
+
+ /* The last element is the new one */
+ queue->last = elem;
+
+ return (0);
+}
+
+/*
+ * node_queue_dequeue --
+ * Removes a node from the beginning of the queue and copies the node's
+ * pointer to the location referred by the retp parameter.
+ */
+static void
+node_queue_dequeue(
+ WT_SESSION_IMPL *session, NODE_QUEUE *queue, WT_FREQTREE_NODE **retp)
+{
+ NODE_QUEUE_ELEM *first_elem;
+
+ /*
+ * Getting the first element of the queue and updating it to point to
+ * the next element as first.
+ */
+ first_elem = queue->first;
+ *retp = first_elem->node;
+ queue->first = first_elem->next;
+
+ /*
+	 * If the dequeued element was also the last one, clear the queue's
+	 * last pointer.
+ */
+ if (queue->last == first_elem)
+ queue->last = NULL;
+
+	/* Free the linked-list element that has been dequeued. */
+ __wt_free(session, first_elem);
+}
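+
+/*
+ * A minimal usage sketch of the queue helpers above (hypothetical caller;
+ * error handling and tree construction omitted):
+ *
+ *	NODE_QUEUE *queue;
+ *	WT_FREQTREE_NODE *node;
+ *
+ *	WT_RET(__wt_calloc_def(session, 1, &queue));
+ *	WT_RET(node_queue_enqueue(session, queue, root));
+ *	while (queue->first != NULL) {
+ *		node_queue_dequeue(session, queue, &node);
+ *		(visit node, enqueue its children)
+ *	}
+ *	node_queue_close(session, queue);
+ */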
diff --git a/src/third_party/wiredtiger/src/support/mutex.c b/src/third_party/wiredtiger/src/support/mutex.c
new file mode 100644
index 00000000000..ffe52cf28fd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/mutex.c
@@ -0,0 +1,257 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+#if SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_LOGGING
+
+/*
+ * __wt_spin_lock_register_lock --
+ * Add a lock to the connection's list.
+ */
+int
+__wt_spin_lock_register_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_CONNECTION_IMPL *conn;
+ u_int i;
+
+ /*
+ * There is a spinlock we initialize before we have a connection, the
+ * global library lock. In that case, the session will be NULL and
+ * we can't track the lock.
+ */
+ if (session == NULL)
+ return (0);
+
+ conn = S2C(session);
+
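+	/*
+	 * Claim a free slot with an atomic compare-and-swap, so two threads
+	 * registering locks concurrently can't both take the same slot.
+	 */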
+ for (i = 0; i < WT_SPINLOCK_MAX; i++)
+ if (conn->spinlock_list[i] == NULL &&
+ WT_ATOMIC_CAS(conn->spinlock_list[i], NULL, t))
+ return (0);
+
+ WT_RET_MSG(session, ENOMEM,
+ "spinlock connection registry failed, increase the connection's "
+ "spinlock list size");
+}
+
+/*
+ * __wt_spin_lock_unregister_lock --
+ * Remove a lock from the connection's list.
+ */
+void
+__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
+{
+ WT_CONNECTION_IMPL *conn;
+ u_int i;
+
+ conn = S2C(session);
+
+ for (i = 0; i < WT_SPINLOCK_MAX; i++)
+ if (conn->spinlock_list[i] == t)
+ conn->spinlock_list[i] = NULL;
+
+ /*
+ * XXX
+ * The statistics thread reads through this array, there's a possible
+ * race: if that thread reads the pointer then goes to sleep, then we
+ * free the spinlock, then the statistics thread wakes up, it can read
+ * free'd memory.
+ *
+	 * This is performance debugging code, so we're not fixing the race
+	 * for now; just minimize the window.
+ */
+ WT_FULL_BARRIER();
+}
+
+/*
+ * __spin_lock_next_id --
+ * Return the next spinlock caller ID.
+ */
+static int
+__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
+{
+ static int lock_id = 0, next_id = 0;
+ WT_DECL_RET;
+
+ /* If we've ever registered this location, we already have an ID. */
+ if (*idp != WT_SPINLOCK_REGISTER)
+ return (0);
+
+ /*
+	 * We can't use the global spinlock to lock the ID allocation (duh!);
+	 * use a CAS instruction to serialize access to a static variable.
+	 * This work only gets done once per library instantiation, so there
+	 * isn't a performance concern.
+ */
+ while (!WT_ATOMIC_CAS(lock_id, 0, 1))
+ __wt_yield();
+
+	/*
+	 * Allocate a blocking ID for this location; re-check under the
+	 * latch, since another thread may have raced us here and already
+	 * set it.
+	 */
+ if (*idp == WT_SPINLOCK_REGISTER) {
+ if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
+ *idp = next_id++;
+ else
+ WT_ERR_MSG(session, ENOMEM,
+ "spinlock caller location registry failed, "
+ "increase the connection's blocking matrix size");
+ }
+
+err: WT_PUBLISH(lock_id, 0);
+ return (ret);
+}
+
+/*
+ * __wt_spin_lock_register_caller --
+ * Register a spin-lock caller's location information in the blocking
+ * matrix.
+ */
+int
+__wt_spin_lock_register_caller(WT_SESSION_IMPL *session,
+ const char *name, const char *file, int line, int *idp)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS_SPINLOCK *p;
+
+ conn = S2C(session);
+
+ /*
+ * The caller's location ID is a static offset into a per-connection
+ * structure, and that has problems: first, if there are multiple
+ * connections, we'll need to hold some kind of lock to avoid racing
+ * when setting that value, and second, if/when there are multiple
+ * connections and/or a single connection is closed and re-opened, the
+ * variable may be initialized and underlying connection information
+ * may not.
+ *
+ * First, allocate a location ID if needed.
+ */
+ WT_RET(__spin_lock_next_id(session, idp));
+
+ /*
+ * Add the caller's information to the blocking matrix. We could race
+ * here (if two threads of control register the same lock at the same
+ * time), but we don't care as both threads are setting the identical
+ * information.
+ */
+ p = &conn->spinlock_block[*idp];
+ p->name = name;
+ if ((p->file = strrchr(file, '/')) == NULL)
+ p->file = file;
+ else
+ ++p->file;
+ p->line = line;
+ return (0);
+}
+
+/*
+ * __wt_statlog_dump_spinlock --
+ * Log the spin-lock statistics.
+ */
+int
+__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag)
+{
+ WT_SPINLOCK *spin;
+ WT_CONNECTION_STATS_SPINLOCK *p, *t;
+ uint64_t block_manager, btree_page, ignore;
+ u_int i, j;
+
+ /*
+	 * Ignore rare acquisitions of a spinlock (using a base rate of 10
+	 * per second), so we don't create graphs we don't care about.
+ */
+ ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10;
+
+ /* Output the number of times each spinlock was acquired. */
+ block_manager = btree_page = 0;
+ for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) {
+ if ((spin = conn->spinlock_list[i]) == NULL)
+ continue;
+
+ /*
+		 * There are two sets of spinlocks we aggregate: the btree
+		 * page locks and the block manager per-file locks.  The
+		 * reason is that the block manager locks grow with the
+		 * number of files open (and LSM and bloom filters can open
+		 * a lot of files), while there are 16 btree page locks and
+		 * splitting them out has not historically been informative.
+ */
+ if (strcmp(spin->name, "block manager") == 0) {
+ block_manager += spin->counter;
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ continue;
+ }
+ if (strcmp(spin->name, "btree page") == 0) {
+ btree_page += spin->counter;
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ continue;
+ }
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ spin->counter <= ignore ? 0 : spin->counter,
+ tag, spin->name) < 0),
+ __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ spin->counter = 0;
+ }
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ block_manager <= ignore ? 0 : block_manager,
+ tag, "block manager") < 0),
+ __wt_errno());
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
+ conn->stat_stamp,
+ btree_page <= ignore ? 0 : btree_page,
+ tag, "btree page") < 0),
+ __wt_errno());
+
+ /*
+ * Output the number of times each location acquires its spinlock and
+ * the blocking matrix.
+ */
+ for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) {
+ p = &conn->spinlock_block[i];
+ if (p->name == NULL)
+ continue;
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %d %s spinlock %s acquired by %s(%d)\n",
+ conn->stat_stamp,
+ p->total <= ignore ? 0 : p->total,
+ tag,
+ p->name, p->file, p->line) < 0), __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ p->total = 0;
+
+ for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) {
+ t = &conn->spinlock_block[j];
+ if (t->name == NULL)
+ continue;
+
+ WT_RET_TEST((fprintf(conn->stat_fp,
+ "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n",
+ conn->stat_stamp,
+ p->blocked[j] <= ignore ? 0 : p->blocked[j],
+ tag,
+ p->name, p->file, p->line,
+ t->file, t->line) < 0), __wt_errno());
+ if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_CLEAR))
+ p->blocked[j] = 0;
+ }
+ }
+
+ WT_FULL_BARRIER(); /* Minimize the window. */
+ return (0);
+}
+
+#endif /* SPINLOCK_PTHREAD_MUTEX_LOGGING */
diff --git a/src/third_party/wiredtiger/src/support/pow.c b/src/third_party/wiredtiger/src/support/pow.c
new file mode 100644
index 00000000000..a6bf6c7227f
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/pow.c
@@ -0,0 +1,130 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+#ifdef __WIREDTIGER_UNUSED__
+
+/*
+ * __wt_nlpo2_round --
+ * Round up to the next-largest power-of-two for a 32-bit unsigned value.
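+ *
+ * For example, __wt_nlpo2_round(1000) returns 1024 and, because of the
+ * initial decrement, __wt_nlpo2_round(1024) returns 1024 itself.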
+ *
+ * In 12 operations, this code computes the next highest power of 2 for a 32-bit
+ * integer. The result may be expressed by the formula 1U << (lg(v - 1) + 1).
+ * Note that in the edge case where v is 0, it returns 0, which isn't a power of
+ * 2; you might append the expression v += (v == 0) to remedy this if it
+ * matters. It would be faster by 2 operations to use the formula and the
+ * log base 2 method that uses a lookup table, but in some situations, lookup
+ * tables are not suitable, so the above code may be best. (On an Athlon XP 2100+
+ * I've found the above shift-left and then OR code is as fast as using a single
+ * BSR assembly language instruction, which scans in reverse to find the highest
+ * set bit.) It works by copying the highest set bit to all of the lower bits,
+ * and then adding one, which results in carries that set all of the lower bits
+ * to 0 and one bit beyond the highest set bit to 1. If the original number was
+ * a power of 2, then the decrement will reduce it to one less, so that we round
+ * up to the same original value. Devised by Sean Anderson, September 14, 2001.
+ * Pete Hart pointed me to a couple of newsgroup posts by him and William Lewis in
+ * February of 1997, where they arrive at the same algorithm.
+ * http://graphics.stanford.edu/~seander/bithacks.html
+ * Sean Eron Anderson, seander@cs.stanford.edu
+ */
+uint32_t
+__wt_nlpo2_round(uint32_t v)
+{
+ v--; /* If v is a power-of-two, return it. */
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return (v + 1);
+}
+
+/*
+ * __wt_nlpo2 --
+ * Return the next largest power-of-two.
+ */
+uint32_t
+__wt_nlpo2(uint32_t v)
+{
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return (v + 1);
+}
+#endif /* __WIREDTIGER_UNUSED__ */
+
+/*
+ * __wt_log2_int --
+ * Find the log base 2 of an integer in O(N) operations;
+ * http://graphics.stanford.edu/~seander/bithacks.html
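+ *
+ * For example, __wt_log2_int(1000) returns 9 (2^9 = 512 <= 1000 < 1024),
+ * and __wt_log2_int(0) returns 0.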
+ */
+uint32_t
+__wt_log2_int(uint32_t n)
+{
+ uint32_t l = 0;
+
+ while (n >>= 1)
+ l++;
+ return (l);
+}
+
+/*
+ * __wt_ispo2 --
+ * Return if a number is a power-of-two.
+ */
+int
+__wt_ispo2(uint32_t v)
+{
+	/*
+	 * Only numbers that are powers of two satisfy the relationship
+	 * ((v & (v - 1)) == 0).
+	 *
+	 * However, 0 also satisfies it, so this incorrectly reports 0 as a
+	 * power of 2; if that matters, use: (v != 0 && !(v & (v - 1)))
+	 */
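+	/* For example: v = 64 is a power of 2, v = 48 is not, v = 0 reports true. */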
+ return ((v & (v - 1)) == 0);
+}
+
+/*
+ * __wt_rduppo2 --
+ *	Round the given int up to the next multiple of N, where N is a power of 2.
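+ *
+ * For example, __wt_rduppo2(1000, 512) returns 1024; if po2 is not a power
+ * of 2, the function returns 0.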
+ */
+uint32_t
+__wt_rduppo2(uint32_t n, uint32_t po2)
+{
+ uint32_t bits, res;
+
+ if (__wt_ispo2(po2)) {
+ bits = __wt_log2_int(po2);
+ res = (((n - 1) >> bits) + 1) << bits;
+ } else
+ res = 0;
+ return (res);
+}
diff --git a/src/third_party/wiredtiger/src/support/rand.c b/src/third_party/wiredtiger/src/support/rand.c
new file mode 100644
index 00000000000..b716eb8c58b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/rand.c
@@ -0,0 +1,69 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "wt_internal.h"
+
+#undef M_W
+#define M_W (rnd)[0]
+#undef M_Z
+#define M_Z (rnd)[1]
+
+/*
+ * __wt_random_init --
+ *	Initialize the state for 32-bit pseudo-random number generation.
+ */
+void
+__wt_random_init(uint32_t *rnd)
+{
+ M_W = 521288629;
+ M_Z = 362436069;
+}
+
+/*
+ * __wt_random --
+ * Return a 32-bit pseudo-random number.
+ *
+ * This is an implementation of George Marsaglia's multiply-with-carry pseudo-
+ * random number generator. Computationally fast, with reasonable randomness
+ * properties.
+ *
+ * We have to be very careful about races here. Multiple threads can call
+ * __wt_random concurrently, and it is okay if those concurrent calls get the
+ * same return value. What is *not* okay is if reading the shared state races
+ * with an update and uses two different values for m_w or m_z. That could
+ * result in a state value of zero, in which case the generator would be
+ * stuck on zero forever. Take local copies of the shared values to avoid
+ * this.
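+ *
+ * Typical usage (sketch):
+ *
+ *	uint32_t rnd[2], r;
+ *	__wt_random_init(rnd);
+ *	r = __wt_random(rnd);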
+ */
+uint32_t
+__wt_random(uint32_t *rnd)
+{
+ uint32_t w = M_W, z = M_Z;
+
+ M_Z = z = 36969 * (z & 65535) + (z >> 16);
+ M_W = w = 18000 * (w & 65535) + (w >> 16);
+ return (z << 16) + (w & 65535);
+}
diff --git a/src/third_party/wiredtiger/src/support/scratch.c b/src/third_party/wiredtiger/src/support/scratch.c
new file mode 100644
index 00000000000..ca2cdac8377
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/scratch.c
@@ -0,0 +1,319 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_buf_grow_worker --
+ * Grow a buffer that may be in-use, and ensure that all data is local to
+ * the buffer.
+ */
+int
+__wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
+{
+ size_t offset;
+ int copy_data;
+
+ /*
+ * Maintain the existing data: there are 3 cases:
+ * No existing data: allocate the required memory, and initialize
+ * the data to reference it.
+ * Existing data local to the buffer: set the data to the same
+ * offset in the re-allocated memory.
+ * Existing data not-local to the buffer: copy the data into the
+ * buffer and set the data to reference it.
+ */
+ if (WT_DATA_IN_ITEM(buf)) {
+ offset = WT_PTRDIFF(buf->data, buf->mem);
+ copy_data = 0;
+ } else {
+ offset = 0;
+ copy_data = buf->size ? 1 : 0;
+ }
+
+ /*
+ * This function is also used to ensure data is local to the buffer,
+ * check to see if we actually need to grow anything.
+ */
+ if (size > buf->memsize) {
+ if (F_ISSET(buf, WT_ITEM_ALIGNED))
+ WT_RET(__wt_realloc_aligned(
+ session, &buf->memsize, size, &buf->mem));
+ else
+ WT_RET(__wt_realloc(
+ session, &buf->memsize, size, &buf->mem));
+ }
+
+ if (buf->data == NULL) {
+ buf->data = buf->mem;
+ buf->size = 0;
+ } else {
+ if (copy_data)
+ memcpy(buf->mem, buf->data, buf->size);
+ buf->data = (uint8_t *)buf->mem + offset;
+ }
+
+ return (0);
+}
+
+/*
+ * __wt_buf_fmt --
+ * Grow a buffer to accommodate a formatted string.
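+ *
+ * For example (sketch), __wt_buf_fmt(session, buf, "%s.%d", name, id)
+ * leaves the formatted, NUL-terminated string in buf->data with buf->size
+ * set to its length, not counting the terminating NUL byte.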
+ */
+int
+__wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+ size_t len;
+
+ for (;;) {
+ va_start(ap, fmt);
+ len = (size_t)vsnprintf(buf->mem, buf->memsize, fmt, ap);
+ va_end(ap);
+
+ /* Check if there was enough space. */
+ if (len < buf->memsize) {
+ buf->data = buf->mem;
+ buf->size = len;
+ return (0);
+ }
+
+ /*
+ * If not, double the size of the buffer: we're dealing with
+ * strings, and we don't expect these numbers to get huge.
+ */
+ WT_RET(__wt_buf_extend(session, buf, len + 1));
+ }
+}
+
+/*
+ * __wt_buf_catfmt --
+ * Grow a buffer to append a formatted string.
+ */
+int
+__wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+ size_t len, space;
+ char *p;
+
+ /*
+ * If we're appending data to an existing buffer, any data field should
+ * point into the allocated memory.  (It wouldn't be insane to copy any
+ * previously existing data at this point if the data wasn't in the local
+ * buffer, but we don't, and silently missing that case would be a bug.)
+ */
+ WT_ASSERT(session, buf->data == NULL || WT_DATA_IN_ITEM(buf));
+
+ for (;;) {
+ va_start(ap, fmt);
+ p = (char *)((uint8_t *)buf->mem + buf->size);
+ WT_ASSERT(session, buf->memsize >= buf->size);
+ space = buf->memsize - buf->size;
+ len = (size_t)vsnprintf(p, (size_t)space, fmt, ap);
+ va_end(ap);
+
+ /* Check if there was enough space. */
+ if (len < space) {
+ buf->size += len;
+ return (0);
+ }
+
+ /*
+ * If not, double the size of the buffer: we're dealing with
+ * strings, and we don't expect these numbers to get huge.
+ */
+ WT_RET(__wt_buf_extend(session, buf, buf->size + len + 1));
+ }
+}
+
+/*
+ * __wt_scr_alloc_func --
+ * Scratch buffer allocation function.
+ */
+int
+__wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_DECL_RET;
+ WT_ITEM *buf, **p, **best, **slot;
+ size_t allocated;
+ u_int i;
+
+ /* Don't risk the caller not catching the error. */
+ *scratchp = NULL;
+
+ /*
+ * Each WT_SESSION_IMPL has an array of scratch buffers available for
+ * use by any function. We use WT_ITEM structures for scratch memory
+ * because we already have functions that do variable-length allocation
+ * on a WT_ITEM. Scratch buffers are allocated only by a single thread
+ * of control, so no locking is necessary.
+ *
+ * Walk the array, looking for a buffer we can use.
+ */
+ for (i = 0, best = slot = NULL,
+ p = session->scratch; i < session->scratch_alloc; ++i, ++p) {
+ /* If we find an empty slot, remember it. */
+ if ((buf = *p) == NULL) {
+ if (slot == NULL)
+ slot = p;
+ continue;
+ }
+
+ if (F_ISSET(buf, WT_ITEM_INUSE))
+ continue;
+
+ /*
+ * If we find a buffer that's not in-use, check its size: we
+ * want the smallest buffer larger than the requested size,
+ * or the largest buffer if none are large enough.
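+		 *
+		 * For example (illustrative sizes): given free buffers of
+		 * 512B and 4KB and a 1KB request, the 4KB buffer is chosen;
+		 * given only 512B and 256B buffers, the 512B one is chosen
+		 * and grown below.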
+ */
+ if (best == NULL ||
+ ((*best)->memsize < size &&
+ buf->memsize > (*best)->memsize) ||
+ (buf->memsize >= size && buf->memsize < (*best)->memsize))
+ best = p;
+
+ /* If we find a perfect match, use it. */
+ if ((*best)->memsize == size)
+ break;
+ }
+
+ /*
+ * If we didn't find a free buffer, extend the array and use the first
+ * slot we allocated.
+ */
+ if (best == NULL && slot == NULL) {
+ allocated = session->scratch_alloc * sizeof(WT_ITEM *);
+ WT_ERR(__wt_realloc(session, &allocated,
+ (session->scratch_alloc + 10) * sizeof(WT_ITEM *),
+ &session->scratch));
+#ifdef HAVE_DIAGNOSTIC
+ allocated = session->scratch_alloc * sizeof(WT_SCRATCH_TRACK);
+ WT_ERR(__wt_realloc(session, &allocated,
+ (session->scratch_alloc + 10) * sizeof(WT_SCRATCH_TRACK),
+ &session->scratch_track));
+#endif
+ slot = session->scratch + session->scratch_alloc;
+ session->scratch_alloc += 10;
+ }
+
+ /*
+	 * If slot is non-NULL, we found an empty slot: try to allocate a
+	 * new buffer for it.
+ */
+ if (best == NULL) {
+ WT_ASSERT(session, slot != NULL);
+ best = slot;
+
+ WT_ERR(__wt_calloc_def(session, 1, best));
+
+ /* Scratch buffers must be aligned. */
+ F_SET(*best, WT_ITEM_ALIGNED);
+ }
+
+ /* Grow the buffer as necessary and return. */
+ WT_ERR(__wt_buf_init(session, *best, size));
+ F_SET(*best, WT_ITEM_INUSE);
+
+#ifdef HAVE_DIAGNOSTIC
+ session->scratch_track[best - session->scratch].file = file;
+ session->scratch_track[best - session->scratch].line = line;
+#endif
+
+ *scratchp = *best;
+ return (0);
+
+err: WT_RET_MSG(session, ret,
+ "session unable to allocate a scratch buffer");
+}
+
+/*
+ * __wt_scr_discard --
+ * Free all memory associated with the scratch buffers.
+ */
+void
+__wt_scr_discard(WT_SESSION_IMPL *session)
+{
+ WT_ITEM **bufp;
+ u_int i;
+
+ for (i = 0,
+ bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp) {
+ if (*bufp == NULL)
+ continue;
+ if (F_ISSET(*bufp, WT_ITEM_INUSE))
+ __wt_errx(session,
+ "scratch buffer allocated and never discarded"
+#ifdef HAVE_DIAGNOSTIC
+ ": %s: %d",
+ session->
+ scratch_track[bufp - session->scratch].file,
+ session->
+ scratch_track[bufp - session->scratch].line
+#endif
+ );
+
+ __wt_buf_free(session, *bufp);
+ __wt_free(session, *bufp);
+ }
+
+ __wt_free(session, session->scratch);
+#ifdef HAVE_DIAGNOSTIC
+ __wt_free(session, session->scratch_track);
+#endif
+}
+
+/*
+ * __wt_ext_scr_alloc --
+ * Allocate a scratch buffer, and return the memory reference.
+ */
+void *
+__wt_ext_scr_alloc(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size)
+{
+ WT_ITEM *buf;
+ WT_SESSION_IMPL *session;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ return (__wt_scr_alloc(session, size, &buf) == 0 ? buf->mem : NULL);
+}
+
+/*
+ * __wt_ext_scr_free --
+ * Free a scratch buffer based on the memory reference.
+ */
+void
+__wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p)
+{
+ WT_ITEM **bufp;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ if ((session = (WT_SESSION_IMPL *)wt_session) == NULL)
+ session = ((WT_CONNECTION_IMPL *)wt_api->conn)->default_session;
+
+ for (i = 0,
+ bufp = session->scratch; i < session->scratch_alloc; ++i, ++bufp)
+ if (*bufp != NULL && (*bufp)->mem == p) {
+ /*
+ * Do NOT call __wt_scr_free() here, it clears the
+ * caller's pointer, which would truncate the list.
+ */
+ F_CLR(*bufp, WT_ITEM_INUSE);
+ return;
+ }
+ __wt_errx(session, "extension free'd non-existent scratch buffer");
+}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
new file mode 100644
index 00000000000..bc468fbe938
--- /dev/null
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -0,0 +1,567 @@
+/* DO NOT EDIT: automatically built by dist/stat.py. */
+
+#include "wt_internal.h"
+
+void
+__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
+{
+ /* Clear, so can also be called for reinitialization. */
+ memset(stats, 0, sizeof(*stats));
+
+ stats->allocation_size.desc =
+ "block manager: file allocation unit size";
+ stats->block_alloc.desc = "block manager: blocks allocated";
+ stats->block_checkpoint_size.desc = "block manager: checkpoint size";
+ stats->block_extension.desc =
+ "block manager: allocations requiring file extension";
+ stats->block_free.desc = "block manager: blocks freed";
+ stats->block_magic.desc = "block manager: file magic number";
+ stats->block_major.desc = "block manager: file major version number";
+	stats->block_minor.desc = "block manager: file minor version number";
+ stats->block_reuse_bytes.desc =
+ "block manager: file bytes available for reuse";
+ stats->block_size.desc = "block manager: file size in bytes";
+ stats->bloom_count.desc = "LSM: bloom filters in the LSM tree";
+ stats->bloom_false_positive.desc = "LSM: bloom filter false positives";
+ stats->bloom_hit.desc = "LSM: bloom filter hits";
+ stats->bloom_miss.desc = "LSM: bloom filter misses";
+ stats->bloom_page_evict.desc =
+ "LSM: bloom filter pages evicted from cache";
+ stats->bloom_page_read.desc =
+ "LSM: bloom filter pages read into cache";
+ stats->bloom_size.desc = "LSM: total size of bloom filters";
+ stats->btree_column_deleted.desc =
+ "btree: column-store variable-size deleted values";
+ stats->btree_column_fix.desc =
+ "btree: column-store fixed-size leaf pages";
+ stats->btree_column_internal.desc =
+ "btree: column-store internal pages";
+ stats->btree_column_variable.desc =
+ "btree: column-store variable-size leaf pages";
+ stats->btree_compact_rewrite.desc =
+ "btree: pages rewritten by compaction";
+ stats->btree_entries.desc = "btree: number of key/value pairs";
+ stats->btree_fixed_len.desc = "btree: fixed-record size";
+ stats->btree_maximum_depth.desc = "btree: maximum tree depth";
+ stats->btree_maxintlitem.desc =
+ "btree: maximum internal page item size";
+ stats->btree_maxintlpage.desc = "btree: maximum internal page size";
+ stats->btree_maxleafitem.desc = "btree: maximum leaf page item size";
+ stats->btree_maxleafpage.desc = "btree: maximum leaf page size";
+ stats->btree_overflow.desc = "btree: overflow pages";
+ stats->btree_row_internal.desc = "btree: row-store internal pages";
+ stats->btree_row_leaf.desc = "btree: row-store leaf pages";
+ stats->cache_bytes_read.desc = "cache: bytes read into cache";
+ stats->cache_bytes_write.desc = "cache: bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
+ stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
+ stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
+ stats->cache_eviction_fail.desc =
+ "cache: data source pages selected for eviction unable to be evicted";
+ stats->cache_eviction_hazard.desc =
+ "cache: hazard pointer blocked page eviction";
+ stats->cache_eviction_internal.desc = "cache: internal pages evicted";
+ stats->cache_overflow_value.desc =
+ "cache: overflow values cached in memory";
+ stats->cache_read.desc = "cache: pages read into cache";
+ stats->cache_read_overflow.desc =
+ "cache: overflow pages read into cache";
+ stats->cache_write.desc = "cache: pages written from cache";
+ stats->compress_raw_fail.desc =
+ "compression: raw compression call failed, no additional data available";
+ stats->compress_raw_fail_temporary.desc =
+ "compression: raw compression call failed, additional data available";
+ stats->compress_raw_ok.desc =
+ "compression: raw compression call succeeded";
+ stats->compress_read.desc = "compression: compressed pages read";
+ stats->compress_write.desc = "compression: compressed pages written";
+ stats->compress_write_fail.desc =
+ "compression: page written failed to compress";
+ stats->compress_write_too_small.desc =
+ "compression: page written was too small to compress";
+ stats->cursor_create.desc = "cursor: create calls";
+ stats->cursor_insert.desc = "cursor: insert calls";
+ stats->cursor_insert_bulk.desc =
+ "cursor: bulk-loaded cursor-insert calls";
+ stats->cursor_insert_bytes.desc =
+ "cursor: cursor-insert key and value bytes inserted";
+ stats->cursor_next.desc = "cursor: next calls";
+ stats->cursor_prev.desc = "cursor: prev calls";
+ stats->cursor_remove.desc = "cursor: remove calls";
+ stats->cursor_remove_bytes.desc =
+ "cursor: cursor-remove key bytes removed";
+ stats->cursor_reset.desc = "cursor: reset calls";
+ stats->cursor_search.desc = "cursor: search calls";
+ stats->cursor_search_near.desc = "cursor: search near calls";
+ stats->cursor_update.desc = "cursor: update calls";
+ stats->cursor_update_bytes.desc =
+ "cursor: cursor-update value bytes updated";
+ stats->lsm_checkpoint_throttle.desc =
+ "LSM: sleep for LSM checkpoint throttle";
+ stats->lsm_chunk_count.desc = "LSM: chunks in the LSM tree";
+ stats->lsm_generation_max.desc =
+ "LSM: highest merge generation in the LSM tree";
+ stats->lsm_lookup_no_bloom.desc =
+ "LSM: queries that could have benefited from a Bloom filter that did not exist";
+ stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
+ stats->rec_dictionary.desc = "reconciliation: dictionary matches";
+ stats->rec_multiblock_internal.desc =
+ "reconciliation: internal page multi-block writes";
+ stats->rec_multiblock_leaf.desc =
+ "reconciliation: leaf page multi-block writes";
+ stats->rec_multiblock_max.desc =
+ "reconciliation: maximum blocks required for a page";
+ stats->rec_overflow_key_internal.desc =
+ "reconciliation: internal-page overflow keys";
+ stats->rec_overflow_key_leaf.desc =
+ "reconciliation: leaf-page overflow keys";
+ stats->rec_overflow_value.desc =
+ "reconciliation: overflow values written";
+ stats->rec_page_delete.desc = "reconciliation: pages deleted";
+ stats->rec_page_match.desc = "reconciliation: page checksum matches";
+ stats->rec_pages.desc = "reconciliation: page reconciliation calls";
+ stats->rec_pages_eviction.desc =
+ "reconciliation: page reconciliation calls for eviction";
+ stats->rec_prefix_compression.desc =
+ "reconciliation: leaf page key bytes discarded using prefix compression";
+ stats->rec_suffix_compression.desc =
+ "reconciliation: internal page key bytes discarded using suffix compression";
+ stats->session_compact.desc = "session: object compaction";
+ stats->session_cursor_open.desc = "session: open cursor count";
+ stats->txn_update_conflict.desc = "txn: update conflicts";
+}
+
+void
+__wt_stat_refresh_dsrc_stats(void *stats_arg)
+{
+ WT_DSRC_STATS *stats;
+
+ stats = (WT_DSRC_STATS *)stats_arg;
+ stats->allocation_size.v = 0;
+ stats->block_alloc.v = 0;
+ stats->block_checkpoint_size.v = 0;
+ stats->block_extension.v = 0;
+ stats->block_free.v = 0;
+ stats->block_magic.v = 0;
+ stats->block_major.v = 0;
+ stats->block_minor.v = 0;
+ stats->block_reuse_bytes.v = 0;
+ stats->block_size.v = 0;
+ stats->bloom_count.v = 0;
+ stats->bloom_false_positive.v = 0;
+ stats->bloom_hit.v = 0;
+ stats->bloom_miss.v = 0;
+ stats->bloom_page_evict.v = 0;
+ stats->bloom_page_read.v = 0;
+ stats->bloom_size.v = 0;
+ stats->btree_column_deleted.v = 0;
+ stats->btree_column_fix.v = 0;
+ stats->btree_column_internal.v = 0;
+ stats->btree_column_variable.v = 0;
+ stats->btree_compact_rewrite.v = 0;
+ stats->btree_entries.v = 0;
+ stats->btree_fixed_len.v = 0;
+ stats->btree_maximum_depth.v = 0;
+ stats->btree_maxintlitem.v = 0;
+ stats->btree_maxintlpage.v = 0;
+ stats->btree_maxleafitem.v = 0;
+ stats->btree_maxleafpage.v = 0;
+ stats->btree_overflow.v = 0;
+ stats->btree_row_internal.v = 0;
+ stats->btree_row_leaf.v = 0;
+ stats->cache_bytes_read.v = 0;
+ stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
+ stats->cache_eviction_clean.v = 0;
+ stats->cache_eviction_dirty.v = 0;
+ stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_hazard.v = 0;
+ stats->cache_eviction_internal.v = 0;
+ stats->cache_overflow_value.v = 0;
+ stats->cache_read.v = 0;
+ stats->cache_read_overflow.v = 0;
+ stats->cache_write.v = 0;
+ stats->compress_raw_fail.v = 0;
+ stats->compress_raw_fail_temporary.v = 0;
+ stats->compress_raw_ok.v = 0;
+ stats->compress_read.v = 0;
+ stats->compress_write.v = 0;
+ stats->compress_write_fail.v = 0;
+ stats->compress_write_too_small.v = 0;
+ stats->cursor_create.v = 0;
+ stats->cursor_insert.v = 0;
+ stats->cursor_insert_bulk.v = 0;
+ stats->cursor_insert_bytes.v = 0;
+ stats->cursor_next.v = 0;
+ stats->cursor_prev.v = 0;
+ stats->cursor_remove.v = 0;
+ stats->cursor_remove_bytes.v = 0;
+ stats->cursor_reset.v = 0;
+ stats->cursor_search.v = 0;
+ stats->cursor_search_near.v = 0;
+ stats->cursor_update.v = 0;
+ stats->cursor_update_bytes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
+ stats->lsm_chunk_count.v = 0;
+ stats->lsm_generation_max.v = 0;
+ stats->lsm_lookup_no_bloom.v = 0;
+ stats->lsm_merge_throttle.v = 0;
+ stats->rec_dictionary.v = 0;
+ stats->rec_multiblock_internal.v = 0;
+ stats->rec_multiblock_leaf.v = 0;
+ stats->rec_multiblock_max.v = 0;
+ stats->rec_overflow_key_internal.v = 0;
+ stats->rec_overflow_key_leaf.v = 0;
+ stats->rec_overflow_value.v = 0;
+ stats->rec_page_delete.v = 0;
+ stats->rec_page_match.v = 0;
+ stats->rec_pages.v = 0;
+ stats->rec_pages_eviction.v = 0;
+ stats->rec_prefix_compression.v = 0;
+ stats->rec_suffix_compression.v = 0;
+ stats->session_compact.v = 0;
+ stats->txn_update_conflict.v = 0;
+}
+
+void
+__wt_stat_aggregate_dsrc_stats(const void *child, const void *parent)
+{
+ WT_DSRC_STATS *c, *p;
+
+ c = (WT_DSRC_STATS *)child;
+ p = (WT_DSRC_STATS *)parent;
+ p->block_alloc.v += c->block_alloc.v;
+ p->block_checkpoint_size.v += c->block_checkpoint_size.v;
+ p->block_extension.v += c->block_extension.v;
+ p->block_free.v += c->block_free.v;
+ p->block_reuse_bytes.v += c->block_reuse_bytes.v;
+ p->block_size.v += c->block_size.v;
+ p->bloom_count.v += c->bloom_count.v;
+ p->bloom_false_positive.v += c->bloom_false_positive.v;
+ p->bloom_hit.v += c->bloom_hit.v;
+ p->bloom_miss.v += c->bloom_miss.v;
+ p->bloom_page_evict.v += c->bloom_page_evict.v;
+ p->bloom_page_read.v += c->bloom_page_read.v;
+ p->bloom_size.v += c->bloom_size.v;
+ p->btree_column_deleted.v += c->btree_column_deleted.v;
+ p->btree_column_fix.v += c->btree_column_fix.v;
+ p->btree_column_internal.v += c->btree_column_internal.v;
+ p->btree_column_variable.v += c->btree_column_variable.v;
+ p->btree_compact_rewrite.v += c->btree_compact_rewrite.v;
+ p->btree_entries.v += c->btree_entries.v;
+ if (c->btree_maximum_depth.v > p->btree_maximum_depth.v)
+ p->btree_maximum_depth.v = c->btree_maximum_depth.v;
+ p->btree_overflow.v += c->btree_overflow.v;
+ p->btree_row_internal.v += c->btree_row_internal.v;
+ p->btree_row_leaf.v += c->btree_row_leaf.v;
+ p->cache_bytes_read.v += c->cache_bytes_read.v;
+ p->cache_bytes_write.v += c->cache_bytes_write.v;
+ p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v;
+ p->cache_eviction_clean.v += c->cache_eviction_clean.v;
+ p->cache_eviction_dirty.v += c->cache_eviction_dirty.v;
+ p->cache_eviction_fail.v += c->cache_eviction_fail.v;
+ p->cache_eviction_hazard.v += c->cache_eviction_hazard.v;
+ p->cache_eviction_internal.v += c->cache_eviction_internal.v;
+ p->cache_overflow_value.v += c->cache_overflow_value.v;
+ p->cache_read.v += c->cache_read.v;
+ p->cache_read_overflow.v += c->cache_read_overflow.v;
+ p->cache_write.v += c->cache_write.v;
+ p->compress_raw_fail.v += c->compress_raw_fail.v;
+ p->compress_raw_fail_temporary.v += c->compress_raw_fail_temporary.v;
+ p->compress_raw_ok.v += c->compress_raw_ok.v;
+ p->compress_read.v += c->compress_read.v;
+ p->compress_write.v += c->compress_write.v;
+ p->compress_write_fail.v += c->compress_write_fail.v;
+ p->compress_write_too_small.v += c->compress_write_too_small.v;
+ p->cursor_create.v += c->cursor_create.v;
+ p->cursor_insert.v += c->cursor_insert.v;
+ p->cursor_insert_bulk.v += c->cursor_insert_bulk.v;
+ p->cursor_insert_bytes.v += c->cursor_insert_bytes.v;
+ p->cursor_next.v += c->cursor_next.v;
+ p->cursor_prev.v += c->cursor_prev.v;
+ p->cursor_remove.v += c->cursor_remove.v;
+ p->cursor_remove_bytes.v += c->cursor_remove_bytes.v;
+ p->cursor_reset.v += c->cursor_reset.v;
+ p->cursor_search.v += c->cursor_search.v;
+ p->cursor_search_near.v += c->cursor_search_near.v;
+ p->cursor_update.v += c->cursor_update.v;
+ p->cursor_update_bytes.v += c->cursor_update_bytes.v;
+ p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v;
+ if (c->lsm_generation_max.v > p->lsm_generation_max.v)
+ p->lsm_generation_max.v = c->lsm_generation_max.v;
+ p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v;
+ p->lsm_merge_throttle.v += c->lsm_merge_throttle.v;
+ p->rec_dictionary.v += c->rec_dictionary.v;
+ p->rec_multiblock_internal.v += c->rec_multiblock_internal.v;
+ p->rec_multiblock_leaf.v += c->rec_multiblock_leaf.v;
+ if (c->rec_multiblock_max.v > p->rec_multiblock_max.v)
+ p->rec_multiblock_max.v = c->rec_multiblock_max.v;
+ p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v;
+ p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v;
+ p->rec_overflow_value.v += c->rec_overflow_value.v;
+ p->rec_page_delete.v += c->rec_page_delete.v;
+ p->rec_page_match.v += c->rec_page_match.v;
+ p->rec_pages.v += c->rec_pages.v;
+ p->rec_pages_eviction.v += c->rec_pages_eviction.v;
+ p->rec_prefix_compression.v += c->rec_prefix_compression.v;
+ p->rec_suffix_compression.v += c->rec_suffix_compression.v;
+ p->session_compact.v += c->session_compact.v;
+ p->session_cursor_open.v += c->session_cursor_open.v;
+ p->txn_update_conflict.v += c->txn_update_conflict.v;
+}
+
+void
+__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
+{
+ /* Clear, so can also be called for reinitialization. */
+ memset(stats, 0, sizeof(*stats));
+
+ stats->async_alloc_race.desc =
+ "async: number of allocation state races";
+ stats->async_alloc_view.desc =
+ "async: number of op slots viewed for alloc";
+ stats->async_cur_queue.desc = "async: current work queue length";
+ stats->async_flush.desc = "async: number of async flush calls";
+ stats->async_full.desc = "async: number of times op allocation failed";
+ stats->async_max_queue.desc = "async: maximum work queue length";
+ stats->async_nowork.desc =
+ "async: number of times worker found no work";
+ stats->async_op_alloc.desc = "async: op allocations";
+ stats->async_op_compact.desc = "async: op compact calls";
+ stats->async_op_insert.desc = "async: op insert calls";
+ stats->async_op_remove.desc = "async: op remove calls";
+ stats->async_op_search.desc = "async: op search calls";
+ stats->async_op_update.desc = "async: op update calls";
+ stats->block_byte_map_read.desc = "block manager: mapped bytes read";
+ stats->block_byte_read.desc = "block manager: bytes read";
+ stats->block_byte_write.desc = "block manager: bytes written";
+ stats->block_map_read.desc = "block manager: mapped blocks read";
+ stats->block_preload.desc = "block manager: blocks pre-loaded";
+ stats->block_read.desc = "block manager: blocks read";
+ stats->block_write.desc = "block manager: blocks written";
+ stats->cache_bytes_dirty.desc =
+ "cache: tracked dirty bytes in the cache";
+ stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache";
+ stats->cache_bytes_max.desc = "cache: maximum bytes configured";
+ stats->cache_bytes_read.desc = "cache: bytes read into cache";
+ stats->cache_bytes_write.desc = "cache: bytes written from cache";
+ stats->cache_eviction_checkpoint.desc =
+ "cache: checkpoint blocked page eviction";
+ stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
+ stats->cache_eviction_deepen.desc =
+ "cache: page split during eviction deepened the tree";
+ stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
+ stats->cache_eviction_fail.desc =
+ "cache: pages selected for eviction unable to be evicted";
+ stats->cache_eviction_force.desc =
+ "cache: pages evicted because they exceeded the in-memory maximum";
+ stats->cache_eviction_force_fail.desc =
+ "cache: failed eviction of pages that exceeded the in-memory maximum";
+ stats->cache_eviction_hazard.desc =
+ "cache: hazard pointer blocked page eviction";
+ stats->cache_eviction_internal.desc = "cache: internal pages evicted";
+ stats->cache_eviction_queue_empty.desc =
+ "cache: eviction server candidate queue empty when topping up";
+ stats->cache_eviction_queue_not_empty.desc =
+ "cache: eviction server candidate queue not empty when topping up";
+ stats->cache_eviction_server_evicting.desc =
+ "cache: eviction server evicting pages";
+ stats->cache_eviction_server_not_evicting.desc =
+ "cache: eviction server populating queue, but not evicting pages";
+ stats->cache_eviction_slow.desc =
+ "cache: eviction server unable to reach eviction goal";
+ stats->cache_eviction_split.desc =
+ "cache: pages split during eviction";
+ stats->cache_eviction_walk.desc = "cache: pages walked for eviction";
+ stats->cache_pages_dirty.desc =
+ "cache: tracked dirty pages in the cache";
+ stats->cache_pages_inuse.desc =
+ "cache: pages currently held in the cache";
+ stats->cache_read.desc = "cache: pages read into cache";
+ stats->cache_write.desc = "cache: pages written from cache";
+ stats->cond_wait.desc = "conn: pthread mutex condition wait calls";
+ stats->cursor_create.desc = "Btree: cursor create calls";
+ stats->cursor_insert.desc = "Btree: cursor insert calls";
+ stats->cursor_next.desc = "Btree: cursor next calls";
+ stats->cursor_prev.desc = "Btree: cursor prev calls";
+ stats->cursor_remove.desc = "Btree: cursor remove calls";
+ stats->cursor_reset.desc = "Btree: cursor reset calls";
+ stats->cursor_search.desc = "Btree: cursor search calls";
+ stats->cursor_search_near.desc = "Btree: cursor search near calls";
+ stats->cursor_update.desc = "Btree: cursor update calls";
+ stats->dh_session_handles.desc = "dhandle: session dhandles swept";
+ stats->dh_session_sweeps.desc = "dhandle: session sweep attempts";
+ stats->file_open.desc = "conn: files currently open";
+ stats->log_buffer_grow.desc = "log: log buffer size increases";
+ stats->log_buffer_size.desc = "log: total log buffer size";
+ stats->log_bytes_user.desc = "log: user provided log bytes written";
+ stats->log_bytes_written.desc = "log: log bytes written";
+ stats->log_close_yields.desc =
+ "log: yields waiting for previous log file close";
+ stats->log_max_filesize.desc = "log: maximum log file size";
+ stats->log_reads.desc = "log: log read operations";
+ stats->log_scan_records.desc = "log: records processed by log scan";
+ stats->log_scan_rereads.desc =
+ "log: log scan records requiring two reads";
+ stats->log_scans.desc = "log: log scan operations";
+ stats->log_slot_closes.desc = "log: consolidated slot closures";
+ stats->log_slot_consolidated.desc = "log: logging bytes consolidated";
+ stats->log_slot_joins.desc = "log: consolidated slot joins";
+ stats->log_slot_races.desc = "log: consolidated slot join races";
+ stats->log_slot_switch_fails.desc =
+ "log: slots selected for switching that were unavailable";
+ stats->log_slot_toobig.desc = "log: record size exceeded maximum";
+ stats->log_slot_toosmall.desc =
+ "log: failed to find a slot large enough for record";
+ stats->log_slot_transitions.desc =
+ "log: consolidated slot join transitions";
+ stats->log_sync.desc = "log: log sync operations";
+ stats->log_writes.desc = "log: log write operations";
+ stats->lsm_checkpoint_throttle.desc =
+ "LSM: sleep for LSM checkpoint throttle";
+ stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
+ stats->lsm_rows_merged.desc = "LSM: rows merged in an LSM tree";
+ stats->lsm_work_queue_app.desc =
+ "LSM: App work units currently queued";
+ stats->lsm_work_queue_manager.desc =
+ "LSM: Merge work units currently queued";
+ stats->lsm_work_queue_max.desc = "LSM: tree queue hit maximum";
+ stats->lsm_work_queue_switch.desc =
+ "LSM: Switch work units currently queued";
+ stats->lsm_work_units_created.desc =
+ "LSM: tree maintenance operations scheduled";
+ stats->lsm_work_units_discarded.desc =
+ "LSM: tree maintenance operations discarded";
+ stats->lsm_work_units_done.desc =
+ "LSM: tree maintenance operations executed";
+ stats->memory_allocation.desc = "conn: memory allocations";
+ stats->memory_free.desc = "conn: memory frees";
+ stats->memory_grow.desc = "conn: memory re-allocations";
+ stats->read_io.desc = "conn: total read I/Os";
+ stats->rec_pages.desc = "reconciliation: page reconciliation calls";
+ stats->rec_pages_eviction.desc =
+ "reconciliation: page reconciliation calls for eviction";
+ stats->rec_split_stashed_bytes.desc =
+ "reconciliation: split bytes currently awaiting free";
+ stats->rec_split_stashed_objects.desc =
+ "reconciliation: split objects currently awaiting free";
+ stats->rwlock_read.desc =
+ "conn: pthread mutex shared lock read-lock calls";
+ stats->rwlock_write.desc =
+ "conn: pthread mutex shared lock write-lock calls";
+ stats->session_cursor_open.desc = "session: open cursor count";
+ stats->session_open.desc = "session: open session count";
+ stats->txn_begin.desc = "txn: transaction begins";
+ stats->txn_checkpoint.desc = "txn: transaction checkpoints";
+ stats->txn_checkpoint_running.desc =
+ "txn: transaction checkpoint currently running";
+ stats->txn_commit.desc = "txn: transactions committed";
+ stats->txn_fail_cache.desc =
+ "txn: transaction failures due to cache overflow";
+ stats->txn_pinned_range.desc =
+ "txn: transaction range of IDs currently pinned";
+ stats->txn_rollback.desc = "txn: transactions rolled back";
+ stats->write_io.desc = "conn: total write I/Os";
+}
+
+void
+__wt_stat_refresh_connection_stats(void *stats_arg)
+{
+ WT_CONNECTION_STATS *stats;
+
+ stats = (WT_CONNECTION_STATS *)stats_arg;
+ stats->async_alloc_race.v = 0;
+ stats->async_alloc_view.v = 0;
+ stats->async_cur_queue.v = 0;
+ stats->async_flush.v = 0;
+ stats->async_full.v = 0;
+ stats->async_max_queue.v = 0;
+ stats->async_nowork.v = 0;
+ stats->async_op_alloc.v = 0;
+ stats->async_op_compact.v = 0;
+ stats->async_op_insert.v = 0;
+ stats->async_op_remove.v = 0;
+ stats->async_op_search.v = 0;
+ stats->async_op_update.v = 0;
+ stats->block_byte_map_read.v = 0;
+ stats->block_byte_read.v = 0;
+ stats->block_byte_write.v = 0;
+ stats->block_map_read.v = 0;
+ stats->block_preload.v = 0;
+ stats->block_read.v = 0;
+ stats->block_write.v = 0;
+ stats->cache_bytes_dirty.v = 0;
+ stats->cache_bytes_read.v = 0;
+ stats->cache_bytes_write.v = 0;
+ stats->cache_eviction_checkpoint.v = 0;
+ stats->cache_eviction_clean.v = 0;
+ stats->cache_eviction_deepen.v = 0;
+ stats->cache_eviction_dirty.v = 0;
+ stats->cache_eviction_fail.v = 0;
+ stats->cache_eviction_force.v = 0;
+ stats->cache_eviction_force_fail.v = 0;
+ stats->cache_eviction_hazard.v = 0;
+ stats->cache_eviction_internal.v = 0;
+ stats->cache_eviction_queue_empty.v = 0;
+ stats->cache_eviction_queue_not_empty.v = 0;
+ stats->cache_eviction_server_evicting.v = 0;
+ stats->cache_eviction_server_not_evicting.v = 0;
+ stats->cache_eviction_slow.v = 0;
+ stats->cache_eviction_split.v = 0;
+ stats->cache_eviction_walk.v = 0;
+ stats->cache_pages_dirty.v = 0;
+ stats->cache_read.v = 0;
+ stats->cache_write.v = 0;
+ stats->cond_wait.v = 0;
+ stats->cursor_create.v = 0;
+ stats->cursor_insert.v = 0;
+ stats->cursor_next.v = 0;
+ stats->cursor_prev.v = 0;
+ stats->cursor_remove.v = 0;
+ stats->cursor_reset.v = 0;
+ stats->cursor_search.v = 0;
+ stats->cursor_search_near.v = 0;
+ stats->cursor_update.v = 0;
+ stats->dh_session_handles.v = 0;
+ stats->dh_session_sweeps.v = 0;
+ stats->log_buffer_grow.v = 0;
+ stats->log_bytes_user.v = 0;
+ stats->log_bytes_written.v = 0;
+ stats->log_close_yields.v = 0;
+ stats->log_reads.v = 0;
+ stats->log_scan_records.v = 0;
+ stats->log_scan_rereads.v = 0;
+ stats->log_scans.v = 0;
+ stats->log_slot_closes.v = 0;
+ stats->log_slot_consolidated.v = 0;
+ stats->log_slot_joins.v = 0;
+ stats->log_slot_races.v = 0;
+ stats->log_slot_switch_fails.v = 0;
+ stats->log_slot_toobig.v = 0;
+ stats->log_slot_toosmall.v = 0;
+ stats->log_slot_transitions.v = 0;
+ stats->log_sync.v = 0;
+ stats->log_writes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
+ stats->lsm_merge_throttle.v = 0;
+ stats->lsm_rows_merged.v = 0;
+ stats->lsm_work_queue_max.v = 0;
+ stats->lsm_work_units_created.v = 0;
+ stats->lsm_work_units_discarded.v = 0;
+ stats->lsm_work_units_done.v = 0;
+ stats->memory_allocation.v = 0;
+ stats->memory_free.v = 0;
+ stats->memory_grow.v = 0;
+ stats->read_io.v = 0;
+ stats->rec_pages.v = 0;
+ stats->rec_pages_eviction.v = 0;
+ stats->rwlock_read.v = 0;
+ stats->rwlock_write.v = 0;
+ stats->txn_begin.v = 0;
+ stats->txn_checkpoint.v = 0;
+ stats->txn_commit.v = 0;
+ stats->txn_fail_cache.v = 0;
+ stats->txn_rollback.v = 0;
+ stats->write_io.v = 0;
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
new file mode 100644
index 00000000000..292d1a37ceb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_txnid_cmp --
+ * Compare transaction IDs for sorting / searching.
+ */
+int
+__wt_txnid_cmp(const void *v1, const void *v2)
+{
+ uint64_t id1, id2;
+
+ id1 = *(uint64_t *)v1;
+ id2 = *(uint64_t *)v2;
+
+ return ((id1 == id2) ? 0 : TXNID_LT(id1, id2) ? -1 : 1);
+}
+
+/*
+ * __txn_sort_snapshot --
+ * Sort a snapshot for faster searching and set the min/max bounds.
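+ *
+ * Roughly, visibility checks treat IDs below snap_min as committed before
+ * the snapshot, IDs at or above snap_max as after it, and IDs found in the
+ * sorted snapshot array as concurrent and therefore not visible.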
+ */
+static void
+__txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ if (n > 1)
+ qsort(txn->snapshot, n, sizeof(uint64_t), __wt_txnid_cmp);
+ txn->snapshot_count = n;
+ txn->snap_max = snap_max;
+ txn->snap_min = (n > 0 && TXNID_LE(txn->snapshot[0], snap_max)) ?
+ txn->snapshot[0] : snap_max;
+ F_SET(txn, TXN_HAS_SNAPSHOT);
+ WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
+}
+
+/*
+ * __wt_txn_release_snapshot --
+ * Release the snapshot in the current transaction.
+ */
+void
+__wt_txn_release_snapshot(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ txn_state = &S2C(session)->txn_global.states[session->id];
+
+ if (txn_state->snap_min != WT_TXN_NONE) {
+ WT_ASSERT(session,
+ session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
+ !__wt_txn_visible_all(session, txn_state->snap_min));
+ txn_state->snap_min = WT_TXN_NONE;
+ }
+ F_CLR(txn, TXN_HAS_SNAPSHOT);
+}
+
+/*
+ * __wt_txn_update_oldest --
+ *	Sweep the running transactions to update the oldest required
+ *	transaction ID.
+ */
+void
+__wt_txn_update_oldest(WT_SESSION_IMPL *session)
+{
+ /*
+ * !!!
+ * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
+ * method (for the oldest transaction ID not yet visible to a running
+ * transaction), and then comparing that oldest ID against committed
+ * transactions to see if updates for a committed transaction are still
+ * visible to running transactions, the oldest transaction ID may be
+ * the same as the last committed transaction ID, if the transaction
+ * state wasn't refreshed after the last transaction committed. Push
+ * past the last committed transaction.
+ */
+ __wt_txn_refresh(session, 0);
+}
+
+/*
+ * __wt_txn_refresh --
+ * Allocate a transaction ID and/or a snapshot.
+ */
+void
+__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s, *txn_state;
+ uint64_t current_id, id, oldest_id;
+ uint64_t prev_oldest_id, snap_min;
+ uint32_t i, n, oldest_session, session_cnt;
+ int32_t count;
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ current_id = snap_min = txn_global->current;
+ prev_oldest_id = txn_global->oldest_id;
+
+ /* For pure read-only workloads, avoid scanning. */
+ if (prev_oldest_id == current_id) {
+ if (get_snapshot) {
+ txn_state->snap_min = current_id;
+ __txn_sort_snapshot(session, 0, current_id);
+ }
+ /* Check that the oldest ID has not moved in the meantime. */
+ if (prev_oldest_id == txn_global->oldest_id &&
+ txn_global->scan_count == 0)
+ return;
+ }
+
+ /*
+ * We're going to scan. Increment the count of scanners to prevent the
+ * oldest ID from moving forwards. Spin if the count is negative,
+ * which indicates that some thread is moving the oldest ID forwards.
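+	 *
+	 * (A positive scan_count means readers are scanning; -1 means a
+	 * single thread has exclusive access to advance the oldest ID, see
+	 * the CAS to -1 below.)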
+ */
+ do {
+ if ((count = txn_global->scan_count) < 0)
+ WT_PAUSE();
+ } while (count < 0 ||
+ !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+
+ /* The oldest ID cannot change until the scan count goes to zero. */
+ prev_oldest_id = txn_global->oldest_id;
+ current_id = oldest_id = snap_min = txn_global->current;
+ oldest_session = 0;
+
+ /* Walk the array of concurrent transactions. */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /*
+ * Build our snapshot of any concurrent transaction IDs.
+ *
+ * Ignore our own ID: we always read our own updates.
+ *
+ * Also ignore the ID if it is older than the oldest ID we saw.
+ * This can happen if we race with a thread that is allocating
+ * an ID -- the ID will not be used because the thread will
+ * keep spinning until it gets a valid one.
+ */
+ if (s != txn_state &&
+ (id = s->id) != WT_TXN_NONE &&
+ TXNID_LE(prev_oldest_id, id)) {
+ if (get_snapshot)
+ txn->snapshot[n++] = id;
+ if (TXNID_LT(id, snap_min))
+ snap_min = id;
+ }
+
+ /*
+ * Ignore the session's own snap_min: we are about to update
+ * it.
+ */
+ if (get_snapshot && s == txn_state)
+ continue;
+
+ /*
+ * !!!
+ * Note: Don't ignore snap_min values older than the previous
+ * oldest ID. Read-uncommitted operations publish snap_min
+ * values without incrementing scan_count to protect the global
+ * table. See the comment in __wt_txn_cursor_op for
+ * more details.
+ */
+ if ((id = s->snap_min) != WT_TXN_NONE &&
+ TXNID_LT(id, oldest_id)) {
+ oldest_id = id;
+ oldest_session = i;
+ }
+ }
+
+ if (TXNID_LT(snap_min, oldest_id))
+ oldest_id = snap_min;
+ if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id))
+ oldest_id = txn->id;
+
+ /*
+ * If we got a new snapshot, update the published snap_min for this
+ * session.
+ */
+ if (get_snapshot) {
+ WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
+ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+ txn_state->snap_min = snap_min;
+ }
+
+ /*
+ * Update the last running ID if we have a much newer value or we are
+ * forcing an update.
+ */
+ if (!get_snapshot || snap_min > txn_global->last_running + 100)
+ txn_global->last_running = snap_min;
+
+ /*
+ * Update the oldest ID if we have a newer ID and we can get exclusive
+ * access. During normal snapshot refresh, only do this if we have a
+ * much newer value. Once we get exclusive access, do another pass to
+ * make sure nobody else is using an earlier ID.
+ */
+ if (TXNID_LT(prev_oldest_id, oldest_id) &&
+ (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
+ WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ if ((id = s->id) != WT_TXN_NONE &&
+ TXNID_LT(id, oldest_id))
+ oldest_id = id;
+ if ((id = s->snap_min) != WT_TXN_NONE &&
+ TXNID_LT(id, oldest_id))
+ oldest_id = id;
+ }
+ if (TXNID_LT(txn_global->oldest_id, oldest_id))
+ txn_global->oldest_id = oldest_id;
+ txn_global->scan_count = 0;
+ } else {
+ if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
+ current_id - oldest_id > 10000 &&
+ txn_global->oldest_session != oldest_session) {
+ (void)__wt_verbose(session, WT_VERB_TRANSACTION,
+ "old snapshot %" PRIu64
+ " pinned in session %d [%s]"
+ " with snap_min %" PRIu64 "\n",
+ oldest_id, oldest_session,
+ conn->sessions[oldest_session].lastop,
+ conn->sessions[oldest_session].txn.snap_min);
+ txn_global->oldest_session = oldest_session;
+ }
+ WT_ASSERT(session, txn_global->scan_count > 0);
+ (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ }
+
+ if (get_snapshot)
+ __txn_sort_snapshot(session, n, current_id);
+}
+
+/*
+ * __wt_txn_begin --
+ * Begin a transaction.
+ */
+int
+__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONFIG_ITEM cval;
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
+ if (cval.len == 0)
+ txn->isolation = session->isolation;
+ else
+ txn->isolation =
+ WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
+ TXN_ISO_SNAPSHOT :
+ WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
+ TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;
+
+ /*
+ * The default sync setting is inherited from the connection, but can
+ * be overridden by an explicit "sync" setting for this transaction.
+ */
+ txn->txn_logsync = S2C(session)->txn_logsync;
+ WT_RET(__wt_config_gets_def(session, cfg, "sync",
+ FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH), &cval));
+ if (!cval.val)
+ txn->txn_logsync = 0;
+
+ F_SET(txn, TXN_RUNNING);
+ if (txn->isolation == TXN_ISO_SNAPSHOT) {
+ if (session->ncursors > 0)
+ WT_RET(__wt_session_copy_values(session));
+ __wt_txn_refresh(session, 1);
+ }
+ return (0);
+}
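
Note: the isolation values parsed above come straight from the public begin_transaction configuration. A minimal sketch of the application-level calls that end up in __wt_txn_begin (error handling elided; nothing here is WiredTiger-internal):

#include <wiredtiger.h>

/* Sketch: drive __wt_txn_begin through the public API. */
static void
example_begin(WT_CONNECTION *conn)
{
	WT_SESSION *session;

	(void)conn->open_session(conn, NULL, NULL, &session);

	/*
	 * Parsed above into TXN_ISO_SNAPSHOT; the other accepted values
	 * are "read-committed" and "read-uncommitted".
	 */
	(void)session->begin_transaction(session, "isolation=snapshot");

	(void)session->rollback_transaction(session, NULL);
	(void)session->close(session, NULL);
}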
+
+/*
+ * __wt_txn_release --
+ * Release the resources associated with the current transaction.
+ */
+void
+__wt_txn_release(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ WT_ASSERT(session, txn->mod_count == 0);
+ txn->notify = NULL;
+
+ txn_global = &S2C(session)->txn_global;
+ txn_state = &txn_global->states[session->id];
+
+ /* Clear the transaction's ID from the global table. */
+ if (F_ISSET(txn, TXN_HAS_ID)) {
+ WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
+ txn->id != WT_TXN_NONE);
+ WT_PUBLISH(txn_state->id, WT_TXN_NONE);
+ txn->id = WT_TXN_NONE;
+ }
+
+ /* Free the scratch buffer allocated for logging. */
+ __wt_logrec_free(session, &txn->logrec);
+
+ /* Discard any memory from the session's split stash that we can. */
+ if (session->split_stash_cnt > 0)
+ __wt_split_stash_discard(session);
+
+ /*
+ * Reset the transaction state to not running and release the snapshot.
+ */
+ __wt_txn_release_snapshot(session);
+ txn->isolation = session->isolation;
+ F_CLR(txn, TXN_ERROR | TXN_HAS_ID | TXN_RUNNING);
+}
+
+/*
+ * __wt_txn_commit --
+ * Commit the current transaction.
+ */
+int
+__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ u_int i;
+
+ txn = &session->txn;
+ WT_ASSERT(session, !F_ISSET(txn, TXN_ERROR));
+
+ if (!F_ISSET(txn, TXN_RUNNING))
+ WT_RET_MSG(session, EINVAL, "No transaction is active");
+
+ /* Commit notification. */
+ if (txn->notify != NULL)
+ WT_TRET(txn->notify->notify(txn->notify,
+ (WT_SESSION *)session, txn->id, 1));
+
+ /* If we are logging, write a commit log record. */
+ if (ret == 0 &&
+ txn->mod_count > 0 && S2C(session)->logging &&
+ !F_ISSET(session, WT_SESSION_NO_LOGGING))
+ ret = __wt_txn_log_commit(session, cfg);
+
+ /*
+ * If anything went wrong, roll back.
+ *
+ * !!!
+ * Nothing can fail after this point.
+ */
+ if (ret != 0) {
+ WT_TRET(__wt_txn_rollback(session, cfg));
+ return (ret);
+ }
+
+ /* Free memory associated with updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
+ __wt_txn_op_free(session, op);
+ txn->mod_count = 0;
+
+ /*
+ * We are about to release the snapshot: copy values into any
+ * positioned cursors so they don't point to updates that could be
+ * freed once we don't have a transaction ID pinned.
+ */
+ if (session->ncursors > 0)
+ WT_RET(__wt_session_copy_values(session));
+
+ __wt_txn_release(session);
+ return (0);
+}
+
+/*
+ * __wt_txn_rollback --
+ * Roll back the current transaction.
+ */
+int
+__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ u_int i;
+
+ WT_UNUSED(cfg);
+
+ txn = &session->txn;
+ if (!F_ISSET(txn, TXN_RUNNING))
+ WT_RET_MSG(session, EINVAL, "No transaction is active");
+
+ /* Rollback notification. */
+ if (txn->notify != NULL)
+ WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
+ txn->id, 0));
+
+ /* Rollback updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
+ /* Metadata updates are never rolled back. */
+ if (op->fileid == WT_METAFILE_ID)
+ continue;
+
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ case TXN_OP_INMEM:
+ op->u.upd->txnid = WT_TXN_ABORTED;
+ break;
+ case TXN_OP_REF:
+ __wt_delete_page_rollback(session, op->u.ref);
+ break;
+ case TXN_OP_TRUNCATE_COL:
+ case TXN_OP_TRUNCATE_ROW:
+ /*
+ * Nothing to do: these operations are only logged for
+ * recovery. The in-memory changes will be rolled back
+ * with a combination of TXN_OP_REF and TXN_OP_INMEM
+ * operations.
+ */
+ break;
+ }
+
+ /* Free any memory allocated for the operation. */
+ __wt_txn_op_free(session, op);
+ }
+ txn->mod_count = 0;
+
+ __wt_txn_release(session);
+ return (ret);
+}
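
Note: these two functions back the public commit/rollback calls. A hedged sketch of the usual caller pattern, committing on success and rolling back on error (the cursor, key and value are hypothetical):

#include <wiredtiger.h>

/* Sketch: commit on success, roll back on failure. */
static int
example_update(WT_SESSION *session, WT_CURSOR *c)
{
	int ret;

	if ((ret = session->begin_transaction(session, NULL)) != 0)
		return (ret);
	c->set_key(c, "key");
	c->set_value(c, "value");
	if ((ret = c->insert(c)) == 0)
		ret = session->commit_transaction(session, NULL);
	else
		(void)session->rollback_transaction(session, NULL);
	return (ret);
}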
+
+/*
+ * __wt_txn_init --
+ * Initialize a session's transaction data.
+ */
+int
+__wt_txn_init(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ txn->id = WT_TXN_NONE;
+
+ WT_RET(__wt_calloc_def(session,
+ S2C(session)->session_size, &txn->snapshot));
+
+#ifdef HAVE_DIAGNOSTIC
+ if (S2C(session)->txn_global.states != NULL) {
+ WT_TXN_STATE *txn_state;
+ txn_state = &S2C(session)->txn_global.states[session->id];
+ WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE);
+ }
+#endif
+
+ /*
+ * Take care to clean these out in case we are reusing the transaction
+ * for eviction.
+ */
+ txn->mod = NULL;
+
+ txn->isolation = session->isolation;
+ return (0);
+}
+
+/*
+ * __wt_txn_stats_update --
+ * Update the transaction statistics for return to the application.
+ */
+void
+__wt_txn_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_TXN_GLOBAL *txn_global;
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS *stats;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ stats = &conn->stats;
+
+ WT_STAT_SET(stats, txn_pinned_range,
+ txn_global->current - txn_global->oldest_id);
+}
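
Note: the pinned range set here is visible to applications through an ordinary statistics cursor. A sketch, assuming the generated key constant for txn_pinned_range is WT_STAT_CONN_TXN_PINNED_RANGE:

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Sketch: read the pinned-range statistic back out. */
static void
example_pinned_range(WT_SESSION *session)
{
	WT_CURSOR *stat;
	const char *desc, *pvalue;
	uint64_t value;

	(void)session->open_cursor(
	    session, "statistics:", NULL, NULL, &stat);
	/* WT_STAT_CONN_TXN_PINNED_RANGE: assumed generated constant. */
	stat->set_key(stat, WT_STAT_CONN_TXN_PINNED_RANGE);
	if (stat->search(stat) == 0 &&
	    stat->get_value(stat, &desc, &pvalue, &value) == 0)
		printf("%s: %" PRIu64 "\n", desc, value);
	(void)stat->close(stat);
}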
+
+/*
+ * __wt_txn_destroy --
+ * Destroy a session's transaction data.
+ */
+void
+__wt_txn_destroy(WT_SESSION_IMPL *session)
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ __wt_free(session, txn->mod);
+ __wt_free(session, txn->snapshot);
+}
+
+/*
+ * __wt_txn_global_init --
+ * Initialize the global transaction state.
+ */
+int
+__wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ u_int i;
+
+ WT_UNUSED(cfg);
+ conn = S2C(session);
+
+ txn_global = &conn->txn_global;
+ txn_global->current = 1;
+ txn_global->oldest_id = 1;
+ txn_global->last_running = 1;
+
+ WT_RET(__wt_calloc_def(
+ session, conn->session_size, &txn_global->states));
+ for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
+ s->id = s->snap_min = WT_TXN_NONE;
+
+ return (0);
+}
+
+/*
+ * __wt_txn_global_destroy --
+ * Destroy the global transaction state.
+ */
+void
+__wt_txn_global_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ if (txn_global != NULL)
+ __wt_free(session, txn_global->states);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
new file mode 100644
index 00000000000..555eec649c6
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -0,0 +1,944 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_checkpoint_name_ok --
+ * Complain if the checkpoint name isn't acceptable.
+ */
+int
+__wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len)
+{
+ /* Check for characters we don't want to see in a metadata file. */
+ WT_RET(__wt_name_check(session, name, len));
+
+ /*
+ * The internal checkpoint name is special, applications aren't allowed
+ * to use it. Be aggressive and disallow any matching prefix, it makes
+ * things easier when checking in other places.
+ */
+ if (len < strlen(WT_CHECKPOINT))
+ return (0);
+ if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT))
+ return (0);
+
+ WT_RET_MSG(session, EINVAL,
+ "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT);
+}
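
Note: from an application's point of view, the check above means any checkpoint name sharing the internal prefix fails with EINVAL. A brief sketch (the acceptable name is hypothetical):

#include <wiredtiger.h>

/* Sketch: reserved names are rejected, ordinary names pass. */
static void
example_names(WT_SESSION *session)
{
	int ret;

	/* WT_CHECKPOINT is "WiredTigerCheckpoint": reserved prefix. */
	ret = session->checkpoint(session, "name=WiredTigerCheckpoint.2");
	/* ret == EINVAL */

	ret = session->checkpoint(session, "name=midnight");
	/* A name without the reserved prefix is accepted. */
	(void)ret;
}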
+
+/*
+ * __checkpoint_name_check --
+ * Check for an attempt to name a checkpoint that includes anything
+ * other than a file object.
+ */
+static int
+__checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *fail;
+
+ cursor = NULL;
+ fail = NULL;
+
+ /*
+ * This function exists as a place for this comment: named checkpoints
+ * are only supported on file objects, and not on LSM trees or Helium
+ * devices. If a target list is configured for the checkpoint, this
+ * function is called with each target list entry; check the entry to
+ * make sure it's backed by a file. If no target list is configured,
+ * confirm the metadata file contains no non-file objects.
+ */
+ if (uri == NULL) {
+ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:")) {
+ fail = uri;
+ break;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ } else
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
+ !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") &&
+ !WT_PREFIX_MATCH(uri, "table:"))
+ fail = uri;
+
+ if (fail != NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "%s object does not support named checkpoints", fail);
+
+err: if (cursor != NULL)
+ WT_TRET(cursor->close(cursor));
+ return (ret);
+}
+
+/*
+ * __checkpoint_apply --
+ * Apply an operation to all files involved in a checkpoint.
+ */
+static int
+__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
+ int (*op)(WT_SESSION_IMPL *, const char *[]), int *fullp)
+{
+ WT_CONFIG targetconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ int ckpt_closed, named, target_list;
+
+ target_list = 0;
+
+ /* Flag if this is a named checkpoint, and check if the name is OK. */
+ WT_RET(__wt_config_gets(session, cfg, "name", &cval));
+ named = cval.len != 0;
+ if (named)
+ WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+
+ /* Step through the targets and optionally operate on each one. */
+ WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
+ WT_ERR(__wt_config_subinit(session, &targetconf, &cval));
+ while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
+ if (!target_list) {
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ target_list = 1;
+ }
+
+ if (v.len != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "invalid checkpoint target %.*s: URIs may require "
+ "quoting",
+ (int)cval.len, (char *)cval.str);
+
+ /* Some objects don't support named checkpoints. */
+ if (named)
+ WT_ERR(__checkpoint_name_check(session, k.str));
+
+ if (op == NULL)
+ continue;
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
+ if ((ret = __wt_schema_worker(
+ session, tmp->data, op, NULL, cfg, 0)) != 0)
+ WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ if (!target_list && named)
+ /* Some objects don't support named checkpoints. */
+ WT_ERR(__checkpoint_name_check(session, NULL));
+
+ if (!target_list && op != NULL) {
+ /*
+ * If the checkpoint is named or we're dropping checkpoints, we
+ * checkpoint both open and closed files; else, only checkpoint
+ * open files.
+ *
+ * XXX
+ * We don't optimize unnamed checkpoints of a list of targets,
+ * we open the targets and checkpoint them even if they are
+ * quiescent and don't need a checkpoint, believing applications
+ * unlikely to checkpoint a list of closed targets.
+ */
+ ckpt_closed = named;
+ if (!ckpt_closed) {
+ WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+ ckpt_closed = cval.len != 0;
+ }
+ WT_ERR(ckpt_closed ?
+ __wt_meta_btree_apply(session, op, cfg) :
+ __wt_conn_btree_apply(session, 0, op, cfg));
+ }
+
+ if (fullp != NULL)
+ *fullp = !target_list;
+
+err: __wt_scr_free(&tmp);
+ return (ret);
+}
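
Note: the target list walked above comes from the checkpoint configuration and, per the error message, each URI must be quoted. A sketch with hypothetical object names:

#include <wiredtiger.h>

/* Sketch: checkpoint only the listed objects; URIs are quoted. */
static int
example_target(WT_SESSION *session)
{
	return (session->checkpoint(session,
	    "target=(\"table:accounts\",\"file:orders.wt\")"));
}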
+
+/*
+ * __checkpoint_data_source --
+ * Checkpoint all data sources.
+ */
+static int
+__checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_NAMED_DATA_SOURCE *ndsrc;
+ WT_DATA_SOURCE *dsrc;
+
+ /*
+ * A place-holder, to support Helium devices: we assume calling the
+ * underlying data-source session checkpoint function is sufficient to
+ * checkpoint all objects in the data source, open or closed, and we
+ * don't attempt to optimize the checkpoint of individual targets.
+	 * Those assumptions are correct for the Helium device, but they're
+	 * not necessarily going to be true for other data sources.
+ *
+ * It's not difficult to support data-source checkpoints of individual
+ * targets (__wt_schema_worker is the underlying function that will do
+ * the work, and it's already written to support data-sources, although
+ * we'd probably need to pass the URI of the object to the data source
+ * checkpoint function which we don't currently do). However, doing a
+ * full data checkpoint is trickier: currently, the connection code is
+ * written to ignore all objects other than "file:", and that code will
+ * require significant changes to work with data sources.
+ */
+ TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) {
+ dsrc = ndsrc->dsrc;
+ if (dsrc->checkpoint != NULL)
+ WT_RET(dsrc->checkpoint(dsrc,
+ (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
+ }
+ return (0);
+}
+
+/*
+ * __wt_checkpoint_list --
+ * Get a list of handles to flush.
+ */
+int
+__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
+ const char *name;
+
+ WT_UNUSED(cfg);
+
+ /* Should not be called with anything other than a file object. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+ WT_ASSERT(session,
+ memcmp(session->dhandle->name, "file:", strlen("file:")) == 0);
+
+ /* Make sure there is space for the next entry. */
+ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
+ session->ckpt_handle_next + 1, &session->ckpt_handle));
+
+ /* Not strictly necessary, but cleaner to clear the current handle. */
+ name = session->dhandle->name;
+ saved_dhandle = session->dhandle;
+ session->dhandle = NULL;
+
+ /* Ignore busy files, we'll deal with them in the checkpoint. */
+ switch (ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) {
+ case 0:
+ session->ckpt_handle[
+ session->ckpt_handle_next++] = session->dhandle;
+ break;
+ case EBUSY:
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ session->dhandle = saved_dhandle;
+ return (ret);
+}
+
+/*
+ * __checkpoint_write_leaves --
+ * Write any dirty leaf pages for all checkpoint handles.
+ */
+static int
+__checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ u_int i;
+
+ i = 0;
+
+ /* Should not be called with any handle reference. */
+ WT_ASSERT(session, session->dhandle == NULL);
+
+ /*
+ * Get a list of handles we want to flush; this may pull closed objects
+ * into the session cache, but we're going to do that eventually anyway.
+ */
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __checkpoint_apply(session, cfg, __wt_checkpoint_list, NULL));
+ WT_ERR(ret);
+
+ /*
+ * Walk the list, flushing the leaf pages from each file, then releasing
+ * the file. Note that we increment inside the loop to simplify error
+ * handling.
+ */
+ while (i < session->ckpt_handle_next) {
+ dhandle = session->ckpt_handle[i++];
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
+ WT_WITH_DHANDLE(session, dhandle,
+ WT_TRET(__wt_session_release_btree(session)));
+ WT_ERR(ret);
+ }
+
+err: while (i < session->ckpt_handle_next) {
+ dhandle = session->ckpt_handle[i++];
+ WT_WITH_DHANDLE(session, dhandle,
+ WT_TRET(__wt_session_release_btree(session)));
+ }
+ __wt_free(session, session->ckpt_handle);
+ session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
+ return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint --
+ * Checkpoint a database or a list of objects in the database.
+ */
+int
+__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_ISOLATION saved_isolation;
+ int full, logging, tracking;
+ const char *txn_cfg[] =
+ { WT_CONFIG_BASE(session, session_begin_transaction),
+ "isolation=snapshot", NULL };
+ void *saved_meta_next;
+
+ conn = S2C(session);
+ saved_isolation = session->isolation;
+ txn = &session->txn;
+ full = logging = tracking = 0;
+
+ /*
+	 * Do a pass over the configuration arguments and figure out what
+	 * kind of checkpoint this is.
+ */
+ WT_RET(__checkpoint_apply(session, cfg, NULL, &full));
+
+ /*
+ * Update the global oldest ID so we do all possible cleanup.
+ *
+ * This is particularly important for compact, so that all dirty pages
+ * can be fully written.
+ */
+ __wt_txn_update_oldest(session);
+
+ /* Flush data-sources before we start the checkpoint. */
+ WT_ERR(__checkpoint_data_source(session, cfg));
+
+ /* Flush dirty leaf pages before we start the checkpoint. */
+ session->isolation = txn->isolation = TXN_ISO_READ_COMMITTED;
+ WT_ERR(__checkpoint_write_leaves(session, cfg));
+
+ /* Acquire the schema lock. */
+ F_SET(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_lock(session, &conn->schema_lock);
+
+ WT_ERR(__wt_meta_track_on(session));
+ tracking = 1;
+
+ /* Tell logging that we are about to start a database checkpoint. */
+ if (conn->logging && full)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));
+
+ /*
+ * Start a snapshot transaction for the checkpoint.
+ *
+ * Note: we don't go through the public API calls because they have
+ * side effects on cursors, which applications can hold open across
+ * calls to checkpoint.
+ */
+ WT_ERR(__wt_txn_begin(session, txn_cfg));
+
+ /* Tell logging that we have started a database checkpoint. */
+ if (conn->logging && full) {
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, full, WT_TXN_LOG_CKPT_START, NULL));
+ logging = 1;
+ }
+
+ WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint, NULL));
+
+ /* Commit the transaction before syncing the file(s). */
+ WT_ERR(__wt_txn_commit(session, NULL));
+
+ /*
+ * Checkpoints have to hit disk (it would be reasonable to configure for
+ * lazy checkpoints, but we don't support them yet).
+ */
+ if (F_ISSET(conn, WT_CONN_CKPT_SYNC))
+ WT_ERR(__checkpoint_apply(
+ session, cfg, __wt_checkpoint_sync, NULL));
+
+ /* Checkpoint the metadata file. */
+ SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (WT_IS_METADATA(dhandle) ||
+ !WT_PREFIX_MATCH(dhandle->name, "file:"))
+ break;
+ }
+ if (dhandle == NULL)
+ WT_ERR_MSG(session, EINVAL,
+ "checkpoint unable to find open meta-data handle");
+
+ /*
+ * Disable metadata tracking during the metadata checkpoint.
+ *
+ * We don't lock old checkpoints in the metadata file: there is no way
+ * to open one. We are holding other handle locks, it is not safe to
+ * lock conn->spinlock.
+ */
+ session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
+ saved_meta_next = session->meta_track_next;
+ session->meta_track_next = NULL;
+ WT_WITH_DHANDLE(session, dhandle, ret = __wt_checkpoint(session, cfg));
+ session->meta_track_next = saved_meta_next;
+
+err: /*
+ * XXX
+ * Rolling back the changes here is problematic.
+ *
+ * If we unroll here, we need a way to roll back changes to the avail
+ * list for each tree that was successfully synced before the error
+ * occurred. Otherwise, the next time we try this operation, we will
+ * try to free an old checkpoint again.
+ *
+ * OTOH, if we commit the changes after a failure, we have partially
+ * overwritten the checkpoint, so what ends up on disk is not
+ * consistent.
+ */
+ session->isolation = txn->isolation = TXN_ISO_READ_UNCOMMITTED;
+ if (tracking)
+ WT_TRET(__wt_meta_track_off(session, ret != 0));
+
+ if (F_ISSET(txn, TXN_RUNNING))
+ WT_TRET(__wt_txn_rollback(session, NULL));
+
+ /* Tell logging that we have finished a database checkpoint. */
+ if (logging)
+ WT_TRET(__wt_txn_checkpoint_log(session, full,
+ (ret == 0) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_FAIL,
+ NULL));
+
+ if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
+ F_CLR(session, WT_SESSION_SCHEMA_LOCKED);
+ __wt_spin_unlock(session, &conn->schema_lock);
+ }
+
+ session->isolation = txn->isolation = saved_isolation;
+
+ return (ret);
+}
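
Note: putting the pieces together, a full named checkpoint taken here can later be read back with a checkpoint cursor. A hedged sketch (table name hypothetical):

#include <wiredtiger.h>

/* Sketch: take a named checkpoint, then read its frozen view. */
static int
example_named(WT_SESSION *session)
{
	WT_CURSOR *c;
	int ret;

	if ((ret = session->checkpoint(session, "name=midnight")) != 0)
		return (ret);
	if ((ret = session->open_cursor(session,
	    "table:accounts", NULL, "checkpoint=midnight", &c)) != 0)
		return (ret);
	/* ... iterate the checkpoint's contents ... */
	return (c->close(c));
}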
+
+/*
+ * __drop --
+ * Drop all checkpoints with a specific name.
+ */
+static void
+__drop(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+ WT_CKPT *ckpt;
+
+ /*
+ * If we're dropping internal checkpoints, match to the '.' separating
+ * the checkpoint name from the generational number, and take all that
+ * we can find. Applications aren't allowed to use any variant of this
+ * name, so the test is still pretty simple, if the leading bytes match,
+ * it's one we want to drop.
+ */
+ if (strncmp(WT_CHECKPOINT, name, len) == 0) {
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
+ F_SET(ckpt, WT_CKPT_DELETE);
+ } else
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (WT_STRING_MATCH(ckpt->name, name, len))
+ F_SET(ckpt, WT_CKPT_DELETE);
+}
+
+/*
+ * __drop_from --
+ * Drop all checkpoints after, and including, the named checkpoint.
+ */
+static void
+__drop_from(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+ WT_CKPT *ckpt;
+ int matched;
+
+ /*
+ * There's a special case -- if the name is "all", then we delete all
+ * of the checkpoints.
+ */
+ if (WT_STRING_MATCH("all", name, len)) {
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ F_SET(ckpt, WT_CKPT_DELETE);
+ return;
+ }
+
+ /*
+ * We use the first checkpoint we can find, that is, if there are two
+ * checkpoints with the same name in the list, we'll delete from the
+ * first match to the end.
+ */
+ matched = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!matched && !WT_STRING_MATCH(ckpt->name, name, len))
+ continue;
+
+ matched = 1;
+ F_SET(ckpt, WT_CKPT_DELETE);
+ }
+}
+
+/*
+ * __drop_to --
+ * Drop all checkpoints before, and including, the named checkpoint.
+ */
+static void
+__drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
+{
+ WT_CKPT *ckpt, *mark;
+
+ /*
+ * We use the last checkpoint we can find, that is, if there are two
+ * checkpoints with the same name in the list, we'll delete from the
+ * beginning to the second match, not the first.
+ */
+ mark = NULL;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (WT_STRING_MATCH(ckpt->name, name, len))
+ mark = ckpt;
+
+ if (mark == NULL)
+ return;
+
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ F_SET(ckpt, WT_CKPT_DELETE);
+
+ if (ckpt == mark)
+ break;
+ }
+}
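
Note: the three helpers map one-to-one onto the drop forms of the checkpoint configuration. A sketch of each ("midnight" is a hypothetical checkpoint name):

#include <wiredtiger.h>

/* Sketch: the drop forms handled by the helpers above. */
static void
example_drop(WT_SESSION *session)
{
	(void)session->checkpoint(session,
	    "drop=(midnight)");		/* __drop */
	(void)session->checkpoint(session,
	    "drop=(from=midnight)");	/* __drop_from */
	(void)session->checkpoint(session,
	    "drop=(to=midnight)");	/* __drop_to */
	(void)session->checkpoint(session,
	    "drop=(from=all)");		/* delete all checkpoints */
}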
+
+/*
+ * __checkpoint_worker --
+ * Checkpoint a tree.
+ */
+static int
+__checkpoint_worker(
+ WT_SESSION_IMPL *session, const char *cfg[], int is_checkpoint)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG dropconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_LSN ckptlsn;
+ const char *name;
+ int deleted, force, hot_backup_locked, track_ckpt, was_modified;
+ char *name_alloc;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ conn = S2C(session);
+ ckpt = ckptbase = NULL;
+ INIT_LSN(&ckptlsn);
+ dhandle = session->dhandle;
+	hot_backup_locked = 0;
+	name_alloc = NULL;
+ track_ckpt = 1;
+ was_modified = btree->modified;
+
+ /* Get the list of checkpoints for this file. */
+ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase));
+
+ /* This may be a named checkpoint, check the configuration. */
+ cval.len = 0;
+ if (cfg != NULL)
+ WT_ERR(__wt_config_gets(session, cfg, "name", &cval));
+ if (cval.len == 0)
+ name = WT_CHECKPOINT;
+ else {
+ WT_ERR(__wt_checkpoint_name_ok(session, cval.str, cval.len));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc));
+ name = name_alloc;
+ }
+
+ /* We may be dropping specific checkpoints, check the configuration. */
+ if (cfg != NULL) {
+ cval.len = 0;
+ WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+ if (cval.len != 0) {
+ WT_ERR(__wt_config_subinit(session, &dropconf, &cval));
+ while ((ret =
+ __wt_config_next(&dropconf, &k, &v)) == 0) {
+ /* Disallow unsafe checkpoint names. */
+ if (v.len == 0)
+ WT_ERR(__wt_checkpoint_name_ok(
+ session, k.str, k.len));
+ else
+ WT_ERR(__wt_checkpoint_name_ok(
+ session, v.str, v.len));
+
+ if (v.len == 0)
+ __drop(ckptbase, k.str, k.len);
+ else if (WT_STRING_MATCH("from", k.str, k.len))
+ __drop_from(ckptbase, v.str, v.len);
+ else if (WT_STRING_MATCH("to", k.str, k.len))
+ __drop_to(ckptbase, v.str, v.len);
+ else
+ WT_ERR_MSG(session, EINVAL,
+ "unexpected value for checkpoint "
+ "key: %.*s",
+ (int)k.len, k.str);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ }
+
+ /* Drop checkpoints with the same name as the one we're taking. */
+ __drop(ckptbase, name, strlen(name));
+
+ /*
+ * Check for clean objects not requiring a checkpoint.
+ *
+ * If we're closing a handle, and the object is clean, we can skip the
+ * checkpoint, whatever checkpoints we have are sufficient. (We might
+ * not have any checkpoints if the object was never modified, and that's
+ * OK: the object creation code doesn't mark the tree modified so we can
+ * skip newly created trees here.)
+ *
+ * If the application repeatedly checkpoints an object (imagine hourly
+ * checkpoints using the same explicit or internal name), there's no
+ * reason to repeat the checkpoint for clean objects. The test is if
+ * the only checkpoint we're deleting is the last one in the list and
+ * it has the same name as the checkpoint we're about to take, skip the
+ * work. (We can't skip checkpoints that delete more than the last
+ * checkpoint because deleting those checkpoints might free up space in
+ * the file.) This means an application toggling between two (or more)
+ * checkpoint names will repeatedly take empty checkpoints, but that's
+ * not likely enough to make detection worthwhile.
+ *
+ * Checkpoint read-only objects otherwise: the application must be able
+ * to open the checkpoint in a cursor after taking any checkpoint, which
+ * means it must exist.
+ */
+ force = 0;
+ if (!btree->modified && cfg != NULL) {
+ ret = __wt_config_gets(session, cfg, "force", &cval);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_ERR(ret);
+ if (ret == 0 && cval.val != 0)
+ force = 1;
+ }
+ if (!btree->modified && !force) {
+ if (!is_checkpoint)
+ goto done;
+
+ deleted = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ ++deleted;
+ /*
+ * Complicated test: if we only deleted a single checkpoint, and
+ * it was the last checkpoint in the object, and it has the same
+ * name as the checkpoint we're taking (correcting for internal
+ * checkpoint names with their generational suffix numbers), we
+ * can skip the checkpoint, there's nothing to do.
+ */
+ if (deleted == 1 &&
+ F_ISSET(ckpt - 1, WT_CKPT_DELETE) &&
+ (strcmp(name, (ckpt - 1)->name) == 0 ||
+ (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+ WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))))
+ goto done;
+ }
+
+ /* Add a new checkpoint entry at the end of the list. */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ ;
+ WT_ERR(__wt_strdup(session, name, &ckpt->name));
+ F_SET(ckpt, WT_CKPT_ADD);
+
+ /*
+ * We can't delete checkpoints if a backup cursor is open. WiredTiger
+ * checkpoints are uniquely named and it's OK to have multiple of them
+ * in the system: clear the delete flag for them, and otherwise fail.
+ * Hold the lock until we're done (blocking hot backups from starting),
+ * we don't want to race with a future hot backup.
+ */
+ __wt_spin_lock(session, &conn->hot_backup_lock);
+ hot_backup_locked = 1;
+ if (conn->hot_backup)
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_ERR_MSG(session, EBUSY,
+ "checkpoint %s blocked by hot backup: it would "
+ "delete an existing checkpoint, and checkpoints "
+ "cannot be deleted during a hot backup",
+ ckpt->name);
+ }
+
+ /*
+ * Lock the checkpoints that will be deleted.
+ *
+ * Checkpoints are only locked when tracking is enabled, which covers
+ * checkpoint and drop operations, but not close. The reasoning is
+ * there should be no access to a checkpoint during close, because any
+ * thread accessing a checkpoint will also have the current file handle
+ * open.
+ */
+ if (WT_META_TRACKING(session))
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+
+ /*
+ * We can't delete checkpoints referenced by a cursor.
+ * WiredTiger checkpoints are uniquely named and it's
+ * OK to have multiple in the system: clear the delete
+ * flag for them, and otherwise fail.
+ */
+ ret = __wt_session_lock_checkpoint(session, ckpt->name);
+ if (ret == 0)
+ continue;
+ if (ret == EBUSY &&
+ WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_ERR_MSG(session, ret,
+ "checkpoints cannot be dropped when in-use");
+ }
+
+ /*
+ * There are special files: those being bulk-loaded, salvaged, upgraded
+ * or verified during the checkpoint. We have to do something for those
+ * objects because a checkpoint is an external name the application can
+ * reference and the name must exist no matter what's happening during
+ * the checkpoint. For bulk-loaded files, we could block until the load
+ * completes, checkpoint the partial load, or magic up an empty-file
+ * checkpoint. The first is too slow, the second is insane, so do the
+ * third.
+ * Salvage, upgrade and verify don't currently require any work, all
+ * three hold the schema lock, blocking checkpoints. If we ever want to
+ * fix that (and I bet we eventually will, at least for verify), we can
+ * copy the last checkpoint the file has. That works if we guarantee
+ * salvage, upgrade and verify act on objects with previous checkpoints
+ * (true if handles are closed/re-opened between object creation and a
+ * subsequent salvage, upgrade or verify operation). Presumably,
+ * salvage and upgrade will discard all previous checkpoints when they
+ * complete, which is fine with us. This change will require reference
+ * counting checkpoints, and once that's done, we should use checkpoint
+ * copy instead of forcing checkpoints on clean objects to associate
+ * names with checkpoints.
+ */
+ if (is_checkpoint)
+ switch (F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) {
+ case 0:
+ break;
+ case WT_BTREE_BULK:
+ /*
+ * The only checkpoints a bulk-loaded file should have
+ * are fake ones we created without the underlying block
+ * manager. I'm leaving this code here because it's a
+ * cheap test and a nasty race.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE))
+				WT_ERR_MSG(session, EINVAL,
+ "block-manager checkpoint found "
+ "for a bulk-loaded file");
+ track_ckpt = 0;
+ goto fake;
+ case WT_BTREE_SALVAGE:
+ case WT_BTREE_UPGRADE:
+ case WT_BTREE_VERIFY:
+ WT_ERR_MSG(session, EINVAL,
+ "checkpoints are blocked during salvage, upgrade "
+ "or verify operations");
+ }
+
+ /*
+ * If an object has never been used (in other words, if it could become
+ * a bulk-loaded file), then we must fake the checkpoint. This is good
+ * because we don't write physical checkpoint blocks for just-created
+ * files, but it's not just a good idea. The reason is because deleting
+ * a physical checkpoint requires writing the file, and fake checkpoints
+ * can't write the file. If you (1) create a physical checkpoint for an
+ * empty file which writes blocks, (2) start bulk-loading records into
+ * the file, (3) during the bulk-load perform another checkpoint with
+ * the same name; in order to keep from having two checkpoints with the
+ * same name you would have to use the bulk-load's fake checkpoint to
+ * delete a physical checkpoint, and that will end in tears.
+ */
+ if (is_checkpoint)
+ if (btree->bulk_load_ok) {
+ track_ckpt = 0;
+ goto fake;
+ }
+
+ /*
+ * Mark the root page dirty to ensure something gets written. (If the
+ * tree is modified, we must write the root page anyway, this doesn't
+ * add additional writes to the process. If the tree is not modified,
+ * we have to dirty the root page to ensure something gets written.)
+ * This is really about paranoia: if the tree modification value gets
+ * out of sync with the set of dirty pages (modify is set, but there
+ * are no dirty pages), we perform a checkpoint without any writes, no
+ * checkpoint is created, and then things get bad.
+ */
+ WT_ERR(__wt_page_modify_init(session, btree->root.page));
+ __wt_page_modify_set(session, btree->root.page);
+
+ /*
+ * Clear the tree's modified flag; any changes before we clear the flag
+ * are guaranteed to be part of this checkpoint (unless reconciliation
+ * skips updates for transactional reasons), and changes subsequent to
+ * the checkpoint start, which might not be included, will re-set the
+ * modified flag. The "unless reconciliation skips updates" problem is
+ * handled in the reconciliation code: if reconciliation skips updates,
+ * it sets the modified flag itself. Use a full barrier so we get the
+ * store done quickly, this isn't a performance path.
+ */
+ btree->modified = 0;
+ WT_FULL_BARRIER();
+
+ /* Tell logging that a file checkpoint is starting. */
+ if (conn->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_START, &ckptlsn));
+
+ /* Flush the file from the cache, creating the checkpoint. */
+ if (is_checkpoint)
+ WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT));
+ else
+ WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE));
+
+ /*
+ * All blocks being written have been written; set the object's write
+ * generation.
+ */
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD))
+ ckpt->write_gen = btree->write_gen;
+
+fake: /* Update the object's metadata. */
+ WT_ERR(__wt_meta_ckptlist_set(
+ session, dhandle->name, ckptbase, &ckptlsn));
+
+ /*
+ * If we wrote a checkpoint (rather than faking one), pages may be
+ * available for re-use. If tracking enabled, defer making pages
+ * available until transaction end. The exception is if the handle
+ * is being discarded, in which case the handle will be gone by the
+ * time we try to apply or unroll the meta tracking event.
+ */
+ if (track_ckpt) {
+ if (WT_META_TRACKING(session) && is_checkpoint)
+ WT_ERR(__wt_meta_track_checkpoint(session));
+ else
+ WT_ERR(bm->checkpoint_resolve(bm, session));
+ }
+
+ /* Tell logging that the checkpoint is complete. */
+ if (conn->logging)
+ WT_ERR(__wt_txn_checkpoint_log(
+ session, 0, WT_TXN_LOG_CKPT_STOP, NULL));
+
+done: err:
+ /*
+ * If the checkpoint didn't complete successfully, make sure the
+ * tree is marked dirty.
+ */
+ if (ret != 0 && !btree->modified && was_modified)
+ btree->modified = 1;
+
+ if (hot_backup_locked)
+ __wt_spin_unlock(session, &conn->hot_backup_lock);
+
+ __wt_meta_ckptlist_free(session, ckptbase);
+ __wt_free(session, name_alloc);
+
+ return (ret);
+}
+
+/*
+ * __wt_checkpoint --
+ * Checkpoint a file.
+ */
+int
+__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ /* Should not be called with a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
+ /* Should be holding the schema lock. */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));
+
+ return (__checkpoint_worker(session, cfg, 1));
+}
+
+/*
+ * __wt_checkpoint_sync --
+ * Sync a file that has been checkpointed, and wait for the result.
+ */
+int
+__wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_BM *bm;
+
+ WT_UNUSED(cfg);
+
+ bm = S2BT(session)->bm;
+
+ /* Should not be called with a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
+ /* Should have an underlying block manager reference. */
+ WT_ASSERT(session, bm != NULL);
+
+ return (bm->sync(bm, session, 0));
+}
+
+/*
+ * __wt_checkpoint_close --
+ * Checkpoint a single file as part of closing the handle.
+ */
+int
+__wt_checkpoint_close(WT_SESSION_IMPL *session, int force)
+{
+	/* If closing an unmodified file, or if forced, discard its blocks. */
+ if (!S2BT(session)->modified || force)
+ return (__wt_cache_op(session, NULL,
+ force ? WT_SYNC_DISCARD_FORCE : WT_SYNC_DISCARD));
+
+ /*
+ * Else, checkpoint the file and optionally flush the writes (the
+ * checkpoint call will discard the blocks, there's no additional
+ * step needed).
+ */
+ WT_RET(__checkpoint_worker(session, NULL, 0));
+ if (F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
+ WT_RET(__wt_checkpoint_sync(session, NULL));
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c
new file mode 100644
index 00000000000..31d5506be5b
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_ext.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_ext_transaction_id --
+ * Return the session's transaction ID.
+ */
+uint64_t
+__wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
+{
+ WT_SESSION_IMPL *session;
+
+ (void)wt_api; /* Unused parameters */
+ session = (WT_SESSION_IMPL *)wt_session;
+ /* Ignore failures: the only case is running out of transaction IDs. */
+ (void)__wt_txn_id_check(session);
+ return (session->txn.id);
+}
+
+/*
+ * __wt_ext_transaction_isolation_level --
+ *	Return the current transaction's isolation level.
+ */
+int
+__wt_ext_transaction_isolation_level(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
+{
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ (void)wt_api; /* Unused parameters */
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ txn = &session->txn;
+
+ if (txn->isolation == TXN_ISO_READ_COMMITTED)
+ return (WT_TXN_ISO_READ_COMMITTED);
+ if (txn->isolation == TXN_ISO_READ_UNCOMMITTED)
+ return (WT_TXN_ISO_READ_UNCOMMITTED);
+ return (WT_TXN_ISO_SNAPSHOT);
+}
+
+/*
+ * __wt_ext_transaction_notify --
+ * Request notification of transaction resolution.
+ */
+int
+__wt_ext_transaction_notify(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify)
+{
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
+
+ (void)wt_api; /* Unused parameters */
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ txn = &session->txn;
+
+ /*
+ * XXX
+ * For now, a single slot for notifications: I'm not bothering with
+ * more than one because more than one data-source in a transaction
+ * doesn't work anyway.
+ */
+ if (txn->notify == notify)
+ return (0);
+ if (txn->notify != NULL)
+ return (ENOMEM);
+
+ txn->notify = notify;
+
+ return (0);
+}
+
+/*
+ * __wt_ext_transaction_oldest --
+ * Return the oldest transaction ID not yet visible to a running
+ * transaction.
+ */
+uint64_t
+__wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api)
+{
+ return (((WT_CONNECTION_IMPL *)wt_api->conn)->txn_global.oldest_id);
+}
+
+/*
+ * __wt_ext_transaction_visible --
+ * Return if the current transaction can see the given transaction ID.
+ */
+int
+__wt_ext_transaction_visible(
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id)
+{
+ (void)wt_api; /* Unused parameters */
+
+ return (__wt_txn_visible(
+ (WT_SESSION_IMPL *)wt_session, transaction_id));
+}
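
Note: these functions are surfaced to loadable data sources through WT_EXTENSION_API; the method names below mirror the functions in this file and are assumed to be wired up by the extension-API initialization code. A hedged sketch of how a data source might use them:

#include <wiredtiger_ext.h>

/* Sketch: a data source deciding whether an update is visible. */
static int
my_source_visible(WT_EXTENSION_API *wt_api,
    WT_SESSION *session, uint64_t id)
{
	/*
	 * Anything older than the oldest running ID is visible to every
	 * transaction (ignoring ID wraparound for this sketch).
	 */
	if (id < wt_api->transaction_oldest(wt_api))
		return (1);
	return (wt_api->transaction_visible(wt_api, session, id));
}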
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
new file mode 100644
index 00000000000..03a71056a9a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -0,0 +1,500 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __txn_op_log --
+ * Log an operation for the current transaction.
+ */
+static int
+__txn_op_log(WT_SESSION_IMPL *session,
+ WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
+{
+ WT_DECL_RET;
+ WT_ITEM key, value;
+ WT_UPDATE *upd;
+ uint64_t recno;
+
+ WT_CLEAR(key);
+ upd = op->u.upd;
+ value.data = WT_UPDATE_DATA(upd);
+ value.size = upd->size;
+
+ /*
+ * Log the operation. It must be one of the following:
+ * 1) column store remove;
+ * 2) column store insert/update;
+ * 3) row store remove; or
+ * 4) row store insert/update.
+ */
+ if (cbt->btree->type != BTREE_ROW) {
+ WT_ASSERT(session, cbt->ins != NULL);
+ recno = WT_INSERT_RECNO(cbt->ins);
+ WT_ASSERT(session, recno != 0);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ WT_ERR(__wt_logop_col_remove_pack(session, logrec,
+ op->fileid, recno));
+ else
+ WT_ERR(__wt_logop_col_put_pack(session, logrec,
+ op->fileid, recno, &value));
+ } else {
+ WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ WT_ERR(__wt_logop_row_remove_pack(session, logrec,
+ op->fileid, &key));
+ else
+ WT_ERR(__wt_logop_row_put_pack(session, logrec,
+ op->fileid, &key, &value));
+ }
+
+err: __wt_buf_free(session, &key);
+ return (ret);
+}
+
+/*
+ * __txn_commit_printlog --
+ * Print a commit log record.
+ */
+static int
+__txn_commit_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
+{
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp)
+ WT_RET(__wt_txn_op_printlog(session, pp, end, out));
+ return (0);
+}
+
+/*
+ * __wt_txn_op_free --
+ * Free memory associated with a transactional operation.
+ */
+void
+__wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op)
+{
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ case TXN_OP_INMEM:
+ case TXN_OP_REF:
+ case TXN_OP_TRUNCATE_COL:
+ break;
+
+ case TXN_OP_TRUNCATE_ROW:
+ __wt_buf_free(session, &op->u.truncate_row.start);
+ __wt_buf_free(session, &op->u.truncate_row.stop);
+ break;
+ }
+}
+
+/*
+ * __txn_logrec_init --
+ * Allocate and initialize a buffer for a transaction's log records.
+ */
+static int
+__txn_logrec_init(WT_SESSION_IMPL *session)
+{
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ WT_TXN *txn;
+ const char *fmt = WT_UNCHECKED_STRING(Iq);
+ uint32_t rectype = WT_LOGREC_COMMIT;
+ size_t header_size;
+
+ txn = &session->txn;
+ if (txn->logrec != NULL)
+ return (0);
+
+ WT_ASSERT(session, txn->id != WT_TXN_NONE);
+ WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ fmt, rectype, txn->id));
+ logrec->size += (uint32_t)header_size;
+ txn->logrec = logrec;
+
+ if (0) {
+err: __wt_logrec_free(session, &logrec);
+ }
+ return (ret);
+}
+
+/*
+ * __wt_txn_log_op --
+ * Write the last logged operation into the in-memory buffer.
+ */
+int
+__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ WT_ITEM *logrec;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+
+ if (!S2C(session)->logging || F_ISSET(session, WT_SESSION_NO_LOGGING))
+ return (0);
+
+ txn = &session->txn;
+
+ /* We'd better have a transaction. */
+ WT_ASSERT(session,
+ F_ISSET(txn, TXN_RUNNING) && F_ISSET(txn, TXN_HAS_ID));
+
+ WT_ASSERT(session, txn->mod_count > 0);
+ op = txn->mod + txn->mod_count - 1;
+
+ WT_RET(__txn_logrec_init(session));
+ logrec = txn->logrec;
+
+ switch (op->type) {
+ case TXN_OP_BASIC:
+ return (__txn_op_log(session, logrec, op, cbt));
+ case TXN_OP_INMEM:
+ case TXN_OP_REF:
+ /* Nothing to log, we're done. */
+ return (0);
+ case TXN_OP_TRUNCATE_COL:
+ return (__wt_logop_col_truncate_pack(session, logrec,
+ op->fileid,
+ op->u.truncate_col.start, op->u.truncate_col.stop));
+ case TXN_OP_TRUNCATE_ROW:
+ return (__wt_logop_row_truncate_pack(session, txn->logrec,
+ op->fileid,
+ &op->u.truncate_row.start, &op->u.truncate_row.stop,
+ (uint32_t)op->u.truncate_row.mode));
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /* NOTREACHED */
+}
+
+/*
+ * __wt_txn_log_commit --
+ * Write the operations of a transaction to the log at commit time.
+ */
+int
+__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_TXN *txn;
+
+ WT_UNUSED(cfg);
+ txn = &session->txn;
+
+ /* Write updates to the log. */
+ return (__wt_log_write(session, txn->logrec, NULL, txn->txn_logsync));
+}
+
+/*
+ * __txn_log_file_sync --
+ * Write a log record for a file sync.
+ */
+static int
+__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp)
+{
+ WT_BTREE *btree;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ size_t header_size;
+ uint32_t rectype = WT_LOGREC_FILE_SYNC;
+ int start;
+ const char *fmt = WT_UNCHECKED_STRING(III);
+
+ btree = S2BT(session);
+ start = LF_ISSET(WT_TXN_LOG_CKPT_START);
+
+ WT_RET(__wt_struct_size(
+ session, &header_size, fmt, rectype, btree->id, start));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, header_size,
+ fmt, rectype, btree->id, start));
+ logrec->size += (uint32_t)header_size;
+
+ WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+err: __wt_logrec_free(session, &logrec);
+ return (ret);
+}
+
+/*
+ * __wt_txn_checkpoint_logread --
+ * Read a log record for a checkpoint operation.
+ */
+int
+__wt_txn_checkpoint_logread(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end,
+ WT_LSN *ckpt_lsn)
+{
+ WT_ITEM ckpt_snapshot;
+ u_int ckpt_nsnapshot;
+ const char *fmt = WT_UNCHECKED_STRING(IQIU);
+
+ WT_RET(__wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
+ &ckpt_lsn->file, &ckpt_lsn->offset,
+ &ckpt_nsnapshot, &ckpt_snapshot));
+ WT_UNUSED(ckpt_nsnapshot);
+ WT_UNUSED(ckpt_snapshot);
+ *pp = end;
+ return (0);
+}
+
+/*
+ * __wt_txn_checkpoint_log --
+ * Write a log record for a checkpoint operation.
+ */
+int
+__wt_txn_checkpoint_log(
+ WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp)
+{
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ WT_LSN *ckpt_lsn;
+ WT_TXN *txn;
+ uint8_t *end, *p;
+ size_t recsize;
+ uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
+ const char *fmt = WT_UNCHECKED_STRING(IIQIU);
+
+ txn = &session->txn;
+ ckpt_lsn = &txn->ckpt_lsn;
+
+ /*
+ * If this is a file sync, log it unless there is a full checkpoint in
+ * progress.
+ */
+ if (!full) {
+ if (txn->full_ckpt) {
+ if (lsnp != NULL)
+ *lsnp = *ckpt_lsn;
+ return (0);
+ } else
+ return (__txn_log_file_sync(session, flags, lsnp));
+ }
+
+ switch (flags) {
+ case WT_TXN_LOG_CKPT_PREPARE:
+ txn->full_ckpt = 1;
+ *ckpt_lsn = S2C(session)->log->alloc_lsn;
+ break;
+
+ case WT_TXN_LOG_CKPT_START:
+ /* Take a copy of the transaction snapshot. */
+ txn->ckpt_nsnapshot = txn->snapshot_count;
+ recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
+ WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
+ p = txn->ckpt_snapshot->mem;
+ end = p + recsize;
+ for (i = 0; i < txn->snapshot_count; i++)
+ WT_ERR(__wt_vpack_uint(
+ &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
+ break;
+
+ case WT_TXN_LOG_CKPT_STOP:
+ /*
+ * During a clean connection close, we get here without the
+ * prepare or start steps. In that case, log the current LSN
+ * as the checkpoint LSN.
+ */
+ if (!txn->full_ckpt) {
+ txn->ckpt_nsnapshot = 0;
+ *ckpt_lsn = S2C(session)->log->alloc_lsn;
+ }
+
+ /* Write the checkpoint log record. */
+ WT_ERR(__wt_struct_size(session, &recsize, fmt,
+ rectype, ckpt_lsn->file, ckpt_lsn->offset,
+ txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
+
+ WT_ERR(__wt_struct_pack(session,
+ (uint8_t *)logrec->data + logrec->size, recsize, fmt,
+ rectype, ckpt_lsn->file, ckpt_lsn->offset,
+ txn->ckpt_nsnapshot, &txn->ckpt_snapshot));
+ logrec->size += (uint32_t)recsize;
+ WT_ERR(__wt_log_write(session, logrec, lsnp, 0));
+
+ /*
+ * If this full checkpoint completed successfully and there is
+ * no hot backup in progress, tell the logging subsystem the
+ * checkpoint LSN so that it can archive.
+ */
+ if (!S2C(session)->hot_backup)
+ WT_ERR(__wt_log_ckpt(session, ckpt_lsn));
+
+ /* FALLTHROUGH */
+ case WT_TXN_LOG_CKPT_FAIL:
+		/* Clean up any allocated resources. */
+ INIT_LSN(ckpt_lsn);
+ txn->ckpt_nsnapshot = 0;
+ __wt_scr_free(&txn->ckpt_snapshot);
+ txn->full_ckpt = 0;
+ break;
+ }
+
+err: __wt_logrec_free(session, &logrec);
+ return (ret);
+}
+
+/*
+ * __wt_txn_truncate_log --
+ * Begin truncating a range of a file.
+ */
+int
+__wt_txn_truncate_log(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+{
+ WT_BTREE *btree;
+ WT_ITEM *item;
+ WT_TXN_OP *op;
+
+ btree = S2BT(session);
+
+ WT_RET(__txn_next_op(session, &op));
+
+ if (btree->type == BTREE_ROW) {
+ op->type = TXN_OP_TRUNCATE_ROW;
+ op->u.truncate_row.mode = TXN_TRUNC_ALL;
+ WT_CLEAR(op->u.truncate_row.start);
+ WT_CLEAR(op->u.truncate_row.stop);
+ if (start != NULL) {
+ op->u.truncate_row.mode = TXN_TRUNC_START;
+ item = &op->u.truncate_row.start;
+ WT_RET(__wt_cursor_get_raw_key(&start->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ if (stop != NULL) {
+ op->u.truncate_row.mode =
+ (op->u.truncate_row.mode == TXN_TRUNC_ALL) ?
+ TXN_TRUNC_STOP : TXN_TRUNC_BOTH;
+ item = &op->u.truncate_row.stop;
+ WT_RET(__wt_cursor_get_raw_key(&stop->iface, item));
+ WT_RET(__wt_buf_set(
+ session, item, item->data, item->size));
+ }
+ } else {
+ op->type = TXN_OP_TRUNCATE_COL;
+ op->u.truncate_col.start =
+ (start == NULL) ? 0 : start->recno;
+ op->u.truncate_col.stop =
+ (stop == NULL) ? 0 : stop->recno;
+ }
+
+ /* Write that operation into the in-memory log. */
+ WT_RET(__wt_txn_log_op(session, NULL));
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM));
+ F_SET(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
+}
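
Note: the four truncate modes recorded above correspond to which cursors the application passes to the public truncate call. A sketch (keys and URI hypothetical):

#include <wiredtiger.h>

/* Sketch: cursor combinations and the mode each one logs. */
static void
example_truncate(WT_SESSION *session, WT_CURSOR *start, WT_CURSOR *stop)
{
	start->set_key(start, "k100");
	stop->set_key(stop, "k200");

	/* Both cursors: TXN_TRUNC_BOTH. */
	(void)session->truncate(session, NULL, start, stop, NULL);
	/* Start cursor only: TXN_TRUNC_START. */
	(void)session->truncate(session, NULL, start, NULL, NULL);
	/* No cursors, a URI instead: TXN_TRUNC_ALL. */
	(void)session->truncate(session, "table:accounts",
	    NULL, NULL, NULL);
}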
+
+/*
+ * __wt_txn_truncate_end --
+ * Finish truncating a range of a file.
+ */
+int
+__wt_txn_truncate_end(WT_SESSION_IMPL *session)
+{
+ F_CLR(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
+}
+
+/*
+ * __txn_printlog --
+ * Print a log record in a human-readable format.
+ */
+static int
+__txn_printlog(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ FILE *out;
+ WT_LSN ckpt_lsn;
+ uint64_t txnid;
+ uint32_t fileid, rectype;
+ int32_t start;
+ const uint8_t *end, *p;
+ const char *msg;
+
+ out = cookie;
+
+ p = LOG_SKIP_HEADER(logrec->data);
+ end = (const uint8_t *)logrec->data + logrec->size;
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ if (fprintf(out, " { \"lsn\" : [%" PRIu32 ",%" PRId64 "],\n",
+ lsnp->file, lsnp->offset) < 0)
+ return (errno);
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(IQ), &ckpt_lsn.file, &ckpt_lsn.offset));
+ if (fprintf(out, " \"type\" : \"checkpoint\"\n") < 0 ||
+ fprintf(
+ out, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRId64 "],\n",
+ ckpt_lsn.file, ckpt_lsn.offset) < 0)
+ return (errno);
+ break;
+
+ case WT_LOGREC_COMMIT:
+ WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+ if (fprintf(out, " \"type\" : \"commit\"\n") < 0 ||
+ fprintf(out, " \"txnid\" : %" PRIu64 ",\n", txnid) < 0)
+ return (errno);
+ WT_RET(__txn_commit_printlog(session, &p, end, out));
+ break;
+
+ case WT_LOGREC_FILE_SYNC:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(Ii), &fileid, &start));
+ if (fprintf(out, " \"type\" : \"file_sync\"\n") < 0 ||
+ fprintf(out, " \"fileid\" : %" PRIu32 "\n",
+ fileid) < 0 ||
+ fprintf(out, " \"start\" : %" PRId32 "\n", start) < 0)
+ return (errno);
+ break;
+
+ case WT_LOGREC_MESSAGE:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
+ WT_UNCHECKED_STRING(S), &msg));
+ if (fprintf(out, " \"type\" : \"message\"\n") < 0 ||
+ fprintf(out, " \"message\" : \"%s\"\n", msg) < 0)
+ return (errno);
+ break;
+ }
+
+ if (fprintf(out, " },\n") < 0)
+ return (errno);
+
+ return (0);
+}
+
+/*
+ * __wt_txn_printlog --
+ * Print the log in a human-readable format.
+ */
+int
+__wt_txn_printlog(WT_SESSION *wt_session, FILE *out)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ if (fprintf(out, "[\n") < 0)
+ return (errno);
+ WT_RET(__wt_log_scan(
+ session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, out));
+ if (fprintf(out, "]\n") < 0)
+ return (errno);
+
+ return (0);
+}
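
Note: this function backs the printlog command of the wt utility. Assembling the fprintf formats above, a commit record prints roughly as follows (LSN, transaction ID and operation contents are hypothetical):

[
 { "lsn" : [1,128],
 "type" : "commit"
 "txnid" : 42,
 },
]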
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
new file mode 100644
index 00000000000..38c606320ef
--- /dev/null
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -0,0 +1,491 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/* State maintained during recovery. */
+typedef struct {
+ WT_SESSION_IMPL *session;
+
+ /* Files from the metadata, indexed by file ID. */
+ struct WT_RECOVERY_FILE {
+ const char *uri; /* File URI. */
+ WT_CURSOR *c; /* Cursor used for recovery. */
+ WT_LSN ckpt_lsn; /* File's checkpoint LSN. */
+ } *files;
+ size_t file_alloc; /* Allocated size of files array. */
+ u_int max_fileid; /* Maximum file ID seen. */
+ u_int nfiles; /* Number of files in the metadata. */
+
+ WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */
+
+ int missing; /* Were there missing files? */
+ int modified; /* Did recovery make any changes? */
+ int metadata_only; /*
+ * Set during the first recovery pass,
+ * when only the metadata is recovered.
+ */
+} WT_RECOVERY;
+
+/*
+ * __recovery_cursor --
+ * Get a cursor for a recovery operation.
+ */
+static int
+__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
+ WT_LSN *lsnp, u_int id, int duplicate, WT_CURSOR **cp)
+{
+ WT_CURSOR *c;
+ const char *cfg[] = { WT_CONFIG_BASE(session, session_open_cursor),
+ "overwrite", NULL };
+ int metadata_op;
+
+ c = NULL;
+
+ /* Track the largest file ID we have seen. */
+ if (id > r->max_fileid)
+ r->max_fileid = id;
+
+ /*
+ * Metadata operations have an id of 0. Match operations based
+ * on the id and the current pass of recovery for metadata.
+ *
+ * Only apply operations in the correct metadata phase, and if the LSN
+ * is more recent than the last checkpoint. If there is no entry for a
+ * file, assume it was dropped or missing after a hot backup.
+ */
+ metadata_op = (id == WT_METAFILE_ID);
+ if (r->metadata_only != metadata_op)
+ ;
+ else if (id >= r->nfiles || r->files[id].uri == NULL) {
+ /* If a file is missing, output a verbose message once. */
+ if (!r->missing)
+ WT_RET(__wt_verbose(session, WT_VERB_RECOVERY,
+ "No file found with ID %u (max %u)",
+ id, r->nfiles));
+ r->missing = 1;
+ } else if (LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ /*
+ * We're going to apply the operation. Get the cursor, opening
+ * one if none is cached.
+ */
+ if ((c = r->files[id].c) == NULL) {
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+ r->files[id].c = c;
+ }
+ }
+
+ if (duplicate && c != NULL)
+ WT_RET(__wt_open_cursor(
+ session, r->files[id].uri, NULL, cfg, &c));
+
+ *cp = c;
+ return (0);
+}
+
+/*
+ * Helper to get a cursor if this operation is to be applied during recovery.
+ */
+#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \
+ WT_ERR(__recovery_cursor( \
+ (session), (r), (lsnp), (fileid), 0, (cp))); \
+ WT_ERR(__wt_verbose((session), WT_VERB_RECOVERY, \
+ "%s op %d to file %d at LSN %u/%" PRIuMAX, \
+ (cursor == NULL) ? "Skipping" : "Applying", \
+ optype, fileid, lsnp->file, (uintmax_t)lsnp->offset)); \
+ if (cursor == NULL) \
+ break
+
+/*
+ * __txn_op_apply --
+ * Apply a transactional operation during recovery.
+ */
+static int
+__txn_op_apply(
+ WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+ WT_CURSOR *cursor, *start, *stop;
+ WT_DECL_RET;
+ WT_ITEM key, start_key, stop_key, value;
+ WT_SESSION_IMPL *session;
+ uint64_t recno, start_recno, stop_recno;
+ uint32_t fileid, mode, optype, opsize;
+
+ session = r->session;
+ cursor = NULL;
+
+ /* Peek at the size and the type. */
+ WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ switch (optype) {
+ case WT_LOGOP_COL_PUT:
+ WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
+ &fileid, &recno, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
+ &fileid, &recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
+ &fileid, &start_recno, &stop_recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+
+ /* Set up the cursors. */
+ if (start_recno == 0) {
+ start = NULL;
+ stop = cursor;
+ } else if (stop_recno == 0) {
+ start = cursor;
+ stop = NULL;
+ } else {
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ start->set_key(start, start_recno);
+ if (stop != NULL)
+ stop->set_key(stop, stop_recno);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
+ &fileid, &key, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
+ &fileid, &key));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
+ &fileid, &start_key, &stop_key, &mode));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ /* Set up the cursors. */
+ start = stop = NULL;
+ switch (mode) {
+ case TXN_TRUNC_ALL:
+ /* Both cursors stay NULL. */
+ break;
+ case TXN_TRUNC_BOTH:
+ start = cursor;
+ WT_ERR(__recovery_cursor(
+ session, r, lsnp, fileid, 1, &stop));
+ break;
+ case TXN_TRUNC_START:
+ start = cursor;
+ break;
+ case TXN_TRUNC_STOP:
+ stop = cursor;
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ __wt_cursor_set_raw_key(start, &start_key);
+ if (stop != NULL)
+ __wt_cursor_set_raw_key(stop, &stop_key);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL,
+ start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Reset the cursor so it doesn't block eviction. */
+ if (cursor != NULL)
+ WT_ERR(cursor->reset(cursor));
+
+ r->modified = 1;
+
+err: if (ret != 0)
+ __wt_err(session, ret, "Operation failed during recovery");
+ return (ret);
+}
+
+/*
+ * __txn_commit_apply --
+ * Apply a commit record during recovery.
+ */
+static int
+__txn_commit_apply(
+ WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+{
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp)
+ WT_RET(__txn_op_apply(r, lsnp, pp, end));
+
+ return (0);
+}
+
+/*
+ * __txn_log_recover --
+ * Roll the log forward to recover committed changes.
+ */
+static int
+__txn_log_recover(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, void *cookie)
+{
+ WT_RECOVERY *r;
+ const uint8_t *end, *p;
+ uint64_t txnid;
+ uint32_t rectype;
+
+ r = cookie;
+ p = LOG_SKIP_HEADER(logrec->data);
+ end = (const uint8_t *)logrec->data + logrec->size;
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ if (r->metadata_only)
+ WT_RET(__wt_txn_checkpoint_logread(
+ session, &p, end, &r->ckpt_lsn));
+ break;
+
+ case WT_LOGREC_COMMIT:
+ WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+ WT_UNUSED(txnid);
+ WT_RET(__txn_commit_apply(r, lsnp, &p, end));
+ break;
+ }
+
+ return (0);
+}
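+
+/*
+ * For reference, a sketch (not the authoritative layout) of what this
+ * function consumes: every log record starts with a record type; a commit
+ * record is followed by a packed transaction ID and then a sequence of
+ * operations, each with its own type/size header that __txn_op_apply reads
+ * before unpacking the payload.
+ */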
+
+/*
+ * __recovery_setup_file --
+ * Set up the recovery slot for a file.
+ */
+static int
+__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
+{
+ WT_CONFIG_ITEM cval;
+ WT_LSN lsn;
+ uint32_t fileid;
+
+ WT_RET(__wt_config_getones(r->session, config, "id", &cval));
+ fileid = (uint32_t)cval.val;
+
+ if (r->nfiles <= fileid) {
+ WT_RET(__wt_realloc_def(
+ r->session, &r->file_alloc, fileid + 1, &r->files));
+ r->nfiles = fileid + 1;
+ }
+
+ WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
+ WT_RET(
+ __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
+	/* If there is no checkpoint logged for the file, apply everything. */
+ if (cval.type != WT_CONFIG_ITEM_STRUCT)
+ INIT_LSN(&lsn);
+ else if (sscanf(cval.str, "(%" PRIu32 ",%" PRIdMAX ")",
+ &lsn.file, (intmax_t*)&lsn.offset) != 2)
+ WT_RET_MSG(r->session, EINVAL,
+ "Failed to parse checkpoint LSN '%.*s'",
+ (int)cval.len, cval.str);
+ r->files[fileid].ckpt_lsn = lsn;
+
+ WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY,
+	    "Recovering %s with id %u @ (%" PRIu32 ", %" PRIuMAX ")",
+	    uri, fileid, lsn.file, (uintmax_t)lsn.offset));
+
+ return (0);
+}
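+
+/*
+ * As an illustration (the values here are hypothetical), a file's metadata
+ * entry might include:
+ *
+ *	id=6,checkpoint_lsn=(2,1024)
+ *
+ * which the sscanf call above parses as log file 2, offset 1024; an entry
+ * without a checkpoint_lsn structure gets the initial LSN, so every logged
+ * operation for the file is applied.
+ */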
+
+/*
+ * __recovery_free --
+ * Free the recovery state.
+ */
+static int
+__recovery_free(WT_RECOVERY *r)
+{
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = r->session;
+ for (i = 0; i < r->nfiles; i++) {
+ __wt_free(session, r->files[i].uri);
+ if ((c = r->files[i].c) != NULL)
+ WT_TRET(c->close(c));
+ }
+
+ __wt_free(session, r->files);
+ return (ret);
+}
+
+/*
+ * __recovery_file_scan --
+ * Scan the files referenced from the metadata and gather information
+ * about them for recovery.
+ */
+static int
+__recovery_file_scan(WT_RECOVERY *r)
+{
+ WT_DECL_RET;
+ WT_CURSOR *c;
+ const char *uri, *config;
+ int cmp;
+
+ /* Scan through all files in the metadata. */
+ c = r->files[0].c;
+ c->set_key(c, "file:");
+ if ((ret = c->search_near(c, &cmp)) != 0) {
+ /* Is the metadata empty? */
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+ if (cmp < 0)
+ WT_ERR_NOTFOUND_OK(c->next(c));
+ for (; ret == 0; ret = c->next(c)) {
+ WT_ERR(c->get_key(c, &uri));
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ WT_ERR(c->get_value(c, &config));
+ WT_ERR(__recovery_setup_file(r, uri, config));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: if (r->nfiles > r->max_fileid)
+ r->max_fileid = r->nfiles;
+ return (ret);
+}
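+
+/*
+ * The metadata is keyed by URI, so the scan above positions on the first
+ * key at or after "file:" and walks forward until the prefix no longer
+ * matches: a (hypothetical) "file:example.wt" entry gets a recovery slot,
+ * while the first "index:", "table:" or other entry ends the loop.
+ */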
+
+/*
+ * __wt_txn_recover --
+ * Run recovery.
+ */
+int
+__wt_txn_recover(WT_CONNECTION_IMPL *conn)
+{
+ WT_CURSOR *metac;
+ WT_DECL_RET;
+ WT_RECOVERY r;
+ WT_SESSION_IMPL *session;
+ struct WT_RECOVERY_FILE *metafile;
+ const char *config;
+ int was_backup;
+
+	WT_CLEAR(r);
+	INIT_LSN(&r.ckpt_lsn);
+	config = NULL;
+ was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;
+
+ /* We need a real session for recovery. */
+ WT_RET(__wt_open_session(conn, NULL, NULL, &session));
+ F_SET(session, WT_SESSION_NO_LOGGING);
+ r.session = session;
+
+ WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
+ WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
+ WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
+ metafile = &r.files[WT_METAFILE_ID];
+ metafile->c = metac;
+
+ /*
+ * First, do a pass through the log to recover the metadata, and
+ * establish the last checkpoint LSN. Skip this when opening a hot
+ * backup: we already have the correct metadata in that case.
+ */
+ if (!was_backup) {
+ r.metadata_only = 1;
+ if (IS_INIT_LSN(&metafile->ckpt_lsn))
+ WT_ERR(__wt_log_scan(session,
+ NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
+ else
+ WT_ERR(__wt_log_scan(session,
+ &metafile->ckpt_lsn, 0, __txn_log_recover, &r));
+
+ WT_ASSERT(session,
+ LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0);
+ }
+
+ /* Scan the metadata to find the live files and their IDs. */
+ WT_ERR(__recovery_file_scan(&r));
+
+ /*
+ * We no longer need the metadata cursor: close it to avoid pinning any
+ * resources that could block eviction during recovery.
+ */
+ r.files[0].c = NULL;
+ WT_ERR(metac->close(metac));
+
+ /*
+ * Now, recover all the files apart from the metadata.
+ * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
+ */
+ r.metadata_only = 0;
+ WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
+ "Main recovery loop: starting at %u/%" PRIuMAX,
+ r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
+ if (IS_INIT_LSN(&r.ckpt_lsn))
+ WT_ERR(__wt_log_scan(session, NULL,
+ WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
+ __txn_log_recover, &r));
+ else
+ WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
+ WT_LOGSCAN_RECOVER,
+ __txn_log_recover, &r));
+
+ conn->next_file_id = r.max_fileid;
+
+ /*
+	 * If recovery ran successfully, forcibly log a checkpoint so the
+	 * next open is fast, and so the metadata stays up to date with the
+	 * checkpoint LSN and log archiving.
+ */
+ WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+
+err: WT_TRET(__recovery_free(&r));
+ __wt_free(session, config);
+ WT_TRET(session->iface.close(&session->iface, NULL));
+
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util.h b/src/third_party/wiredtiger/src/utilities/util.h
new file mode 100644
index 00000000000..1f2f0b7211a
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include <wt_internal.h>
+
+typedef struct {
+ void *mem; /* Managed memory chunk */
+ size_t memsize; /* Managed memory size */
+} ULINE;
+
+extern const char *home; /* Home directory */
+extern const char *progname; /* Program name */
+extern const char *usage_prefix; /* Global arguments */
+extern int verbose; /* Verbose flag */
+
+extern WT_EVENT_HANDLER *verbose_handler;
+
+extern int __wt_opterr; /* if error message should be printed */
+extern int __wt_optind; /* index into parent argv vector */
+extern int __wt_optopt; /* character checked for validity */
+extern int __wt_optreset; /* reset getopt */
+extern char *__wt_optarg; /* argument associated with option */
+
+int util_backup(WT_SESSION *, int, char *[]);
+int util_cerr(const char *, const char *, int);
+int util_compact(WT_SESSION *, int, char *[]);
+void util_copyright(void);
+int util_create(WT_SESSION *, int, char *[]);
+int util_drop(WT_SESSION *, int, char *[]);
+int util_dump(WT_SESSION *, int, char *[]);
+int util_err(int, const char *, ...);
+int util_flush(WT_SESSION *, const char *);
+int util_list(WT_SESSION *, int, char *[]);
+int util_load(WT_SESSION *, int, char *[]);
+int util_loadtext(WT_SESSION *, int, char *[]);
+char *util_name(const char *, const char *);
+int util_printlog(WT_SESSION *, int, char *[]);
+int util_read(WT_SESSION *, int, char *[]);
+int util_read_line(ULINE *, int, int *);
+int util_rename(WT_SESSION *, int, char *[]);
+int util_salvage(WT_SESSION *, int, char *[]);
+int util_stat(WT_SESSION *, int, char *[]);
+int util_str2recno(const char *p, uint64_t *recnop);
+int util_upgrade(WT_SESSION *, int, char *[]);
+int util_verify(WT_SESSION *, int, char *[]);
+int util_write(WT_SESSION *, int, char *[]);
diff --git a/src/third_party/wiredtiger/src/utilities/util_backup.c b/src/third_party/wiredtiger/src/utilities/util_backup.c
new file mode 100644
index 00000000000..aa61cc338f0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_backup.c
@@ -0,0 +1,205 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int copy(const char *, const char *);
+static int usage(void);
+
+#define CBUF_LEN (128 * 1024) /* Copy buffer and size. */
+static char *cbuf;
+
+/*
+ * append_target --
+ * Build a list of comma-separated targets.
+ */
+static int
+append_target(const char *target, char **bufp)
+{
+ static int first = 1;
+ static size_t len = 0, remain = 0;
+ static char *buf = NULL;
+
+ /* 20 bytes of slop */
+ if (remain < strlen(target) + 20) {
+ len += strlen(target) + 512;
+ remain += strlen(target) + 512;
+ if ((buf = realloc(buf, len)) == NULL)
+ return (util_err(errno, NULL));
+ *bufp = buf;
+ }
+ if (first) {
+ first = 0;
+ strcpy(buf, "target=(");
+ } else
+ buf[strlen(buf) - 1] = ','; /* overwrite previous ")" */
+ strcat(buf, "\"");
+ strcat(buf, target);
+ strcat(buf, "\")");
+ remain -= strlen(target) + 1;
+
+ return (0);
+}
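+
+/*
+ * For example, two -t options with the (hypothetical) arguments "table:a"
+ * and "table:b" build the configuration string:
+ *
+ *	target=("table:a","table:b")
+ *
+ * where each append overwrites the previous closing parenthesis with a
+ * comma before adding the new quoted target.
+ */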
+
+int
+util_backup(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int ch;
+ char *config;
+ const char *directory, *name;
+
+ config = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "t:")) != EOF)
+ switch (ch) {
+ case 't':
+ if (append_target(__wt_optarg, &config))
+ return (1);
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ if (argc != 1) {
+		ret = usage();
+ goto err;
+ }
+ directory = *argv;
+
+ if ((ret = session->open_cursor(
+ session, "backup:", NULL, config, &cursor)) != 0) {
+ fprintf(stderr, "%s: cursor open(backup:) failed: %s\n",
+ progname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Copy the files. */
+ while (
+ (ret = cursor->next(cursor)) == 0 &&
+ (ret = cursor->get_key(cursor, &name)) == 0)
+ if ((ret = copy(name, directory)) != 0)
+ goto err;
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: cursor next(backup:) failed: %s\n",
+ progname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+err: if (config != NULL)
+ free(config);
+ if (cbuf != NULL)
+ free(cbuf);
+
+ return (ret);
+}
+
+static int
+copy(const char *name, const char *directory)
+{
+ WT_DECL_RET;
+ ssize_t n;
+ int ifd, ofd;
+
+ ret = 1;
+ ifd = ofd = -1;
+
+ if (verbose &&
+ printf("Backing up %s/%s to %s\n", home, name, directory) < 0) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ return (1);
+ }
+
+	/* Allocate a large copy buffer (use it to build pathnames as well). */
+ if (cbuf == NULL && (cbuf = malloc(CBUF_LEN)) == NULL)
+ goto memerr;
+
+ /* Open the read file. */
+ if (snprintf(cbuf, CBUF_LEN, "%s/%s", home, name) >= CBUF_LEN)
+ goto memerr;
+ if ((ifd = open(cbuf, O_BINARY | O_RDONLY, 0)) < 0)
+ goto readerr;
+
+ /* Open the write file. */
+ if (snprintf(cbuf, CBUF_LEN, "%s/%s", directory, name) >= CBUF_LEN)
+ goto memerr;
+ if ((ofd = open(
+ cbuf, O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, 0666)) < 0)
+ goto writerr;
+
+ /* Copy the file. */
+ while ((n = read(ifd, cbuf, CBUF_LEN)) > 0)
+ if (write(ofd, cbuf, (size_t)n) != n)
+ goto writerr;
+ if (n != 0)
+ goto readerr;
+
+ /*
+ * Close file descriptors (forcing a flush on the write side), and
+ * check for any errors.
+ */
+ ret = close(ifd);
+ ifd = -1;
+ if (ret != 0)
+ goto readerr;
+
+ /*
+	 * We need to know this file was successfully written: it's a backup.
+ */
+#ifdef _WIN32
+ if (FlushFileBuffers((HANDLE)_get_osfhandle(ofd)) == 0) {
+ DWORD err = GetLastError();
+ ret = err;
+ goto writerr;
+ }
+#else
+ if (fsync(ofd))
+ goto writerr;
+#endif
+ ret = close(ofd);
+ ofd = -1;
+ if (ret != 0)
+ goto writerr;
+
+ /* Success. */
+ ret = 0;
+
+ if (0) {
+readerr: fprintf(stderr,
+ "%s: %s/%s: %s\n", progname, home, name, strerror(errno));
+ }
+ if (0) {
+writerr: fprintf(stderr, "%s: %s/%s: %s\n",
+ progname, directory, name, strerror(errno));
+ }
+ if (0) {
+memerr: fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ }
+
+ if (ifd >= 0)
+ (void)close(ifd);
+ if (ofd >= 0)
+ (void)close(ofd);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "backup [-t uri] directory\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_compact.c b/src/third_party/wiredtiger/src/utilities/util_compact.c
new file mode 100644
index 00000000000..51d5461e43c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_compact.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_compact(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *uri;
+
+ uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ if ((ret = session->compact(session, uri, NULL)) != 0) {
+ fprintf(stderr, "%s: compact(%s): %s\n",
+ progname, uri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (uri != NULL)
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "compact uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_cpyright.c b/src/third_party/wiredtiger/src/utilities/util_cpyright.c
new file mode 100644
index 00000000000..21d82828863
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_cpyright.c
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+void
+util_copyright(void)
+{
+ printf("%s\n", "Copyright (c) 2008-2014 WiredTiger, Inc.");
+ printf("%s\n\n", "All rights reserved.");
+
+ printf("%s\n\n",
+ "This program is free software: you can redistribute it and/or\n"
+ "modify it under the terms of version 3 of the GNU General\n"
+ "Public License as published by the Free Software Foundation.");
+
+ printf("%s\n\n",
+ "This program is distributed in the hope that it will be useful,\n"
+ "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+ "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
+ "GNU General Public License for more details:");
+
+ printf("\t%s\n\n",
+ "http://www.gnu.org/licenses/gpl-3.0-standalone.html");
+
+ printf("%s\n",
+ "For a license to use the WiredTiger software under conditions\n"
+ "other than those described by the GNU General Public License,\n"
+ "or for technical support for this software, contact WiredTiger,\n"
+ "Inc. at info@wiredtiger.com.");
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_create.c b/src/third_party/wiredtiger/src/utilities/util_create.c
new file mode 100644
index 00000000000..ebff3a8ad05
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_create.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_create(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ const char *config, *uri;
+
+ config = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF)
+ switch (ch) {
+ case 'c': /* command-line configuration */
+ config = __wt_optarg;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the uri. */
+ if (argc != 1)
+ return (usage());
+
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ if ((ret = session->create(session, uri, config)) != 0)
+ return (util_err(ret, "%s: session.create", uri));
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "create [-c configuration] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_drop.c b/src/third_party/wiredtiger/src/utilities/util_drop.c
new file mode 100644
index 00000000000..6fe416882a3
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_drop.c
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_drop(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *name;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the uri. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ ret = session->drop(session, name, "force");
+
+ if (name != NULL)
+ free(name);
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "drop uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_dump.c b/src/third_party/wiredtiger/src/utilities/util_dump.c
new file mode 100644
index 00000000000..bd0590948b4
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_dump.c
@@ -0,0 +1,701 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int dump_config(WT_SESSION *, const char *, int);
+static int dump_json_begin(void);
+static int dump_json_end(void);
+static int dump_json_separator(void);
+static int dump_json_table_begin(WT_CURSOR *, const char *, const char *);
+static int dump_json_table_cg(WT_CURSOR *, const char *, const char *,
+ const char *, const char *);
+static int dump_json_table_config(WT_SESSION *, const char *);
+static int dump_json_table_end(void);
+static int dump_prefix(int);
+static int dump_record(WT_CURSOR *, const char *, int, int);
+static int dump_suffix(void);
+static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *);
+static int dump_table_config_type(WT_SESSION *,
+ WT_CURSOR *, WT_CURSOR *, const char *, const char *, const char *);
+static int dup_json_string(const char *, char **);
+static int print_config(WT_SESSION *, const char *, const char *, const char *);
+static int usage(void);
+
+int
+util_dump(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ size_t len;
+ int ch, hex, i, json, reverse;
+ char *checkpoint, *config, *name;
+
+ hex = json = reverse = 0;
+ checkpoint = config = name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF)
+ switch (ch) {
+ case 'c':
+ checkpoint = __wt_optarg;
+ break;
+ case 'f': /* output file */
+ if (freopen(__wt_optarg, "w", stdout) == NULL)
+ return (
+ util_err(errno, "%s: reopen", __wt_optarg));
+ break;
+ case 'j':
+ json = 1;
+ break;
+ case 'r':
+ reverse = 1;
+ break;
+ case 'x':
+ hex = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* -j and -x are incompatible. */
+ if (hex && json) {
+ fprintf(stderr,
+ "%s: the -j and -x dump options are incompatible\n",
+ progname);
+ goto err;
+ }
+
+ /* The remaining argument is the uri. */
+ if (argc < 1 || (argc != 1 && !json))
+ return (usage());
+
+ if (json && (ret = dump_json_begin()) != 0)
+ goto err;
+
+ for (i = 0; i < argc; i++) {
+ if (json && i > 0)
+ if ((ret = dump_json_separator()) != 0)
+ goto err;
+ if (name != NULL) {
+ free(name);
+ name = NULL;
+ }
+ if ((name = util_name(argv[i], "table")) == NULL)
+ goto err;
+
+ if (json && dump_json_table_config(session, name) != 0)
+ goto err;
+ if (!json && dump_config(session, name, hex) != 0)
+ goto err;
+
+ len =
+ checkpoint == NULL ? 0 : strlen("checkpoint=") +
+ strlen(checkpoint) + 1;
+ len += strlen(json ? "dump=json" :
+ (hex ? "dump=hex" : "dump=print"));
+ if ((config = malloc(len + 10)) == NULL)
+ goto err;
+ if (checkpoint == NULL)
+ config[0] = '\0';
+ else {
+ (void)strcpy(config, "checkpoint=");
+ (void)strcat(config, checkpoint);
+ (void)strcat(config, ",");
+ }
+ (void)strcat(config, json ? "dump=json" :
+ (hex ? "dump=hex" : "dump=print"));
+ if ((ret = session->open_cursor(
+ session, name, NULL, config, &cursor)) != 0) {
+ fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if ((ret = dump_record(cursor, name, reverse, json)) != 0)
+ goto err;
+ if (json && (ret = dump_json_table_end()) != 0)
+ goto err;
+ }
+ if (json && ((ret = dump_json_end()) != 0))
+ goto err;
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (config != NULL)
+ free(config);
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+/*
+ * dump_config --
+ * Dump the config for the uri.
+ */
+static int
+dump_config(WT_SESSION *session, const char *uri, int hex)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int tret;
+
+ /* Open a metadata cursor. */
+ if ((ret = session->open_cursor(
+ session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+ progname, WT_METADATA_URI, wiredtiger_strerror(ret));
+ return (1);
+ }
+ /*
+	 * Search for the object itself, just to make sure it exists: we don't
+	 * want to output a header if the user entered the wrong name. This is
+	 * also where we find out a table doesn't exist, so use a simple error
+	 * message in that case.
+ */
+ cursor->set_key(cursor, uri);
+ if ((ret = cursor->search(cursor)) == 0) {
+ if (dump_prefix(hex) != 0 ||
+ dump_table_config(session, cursor, uri) != 0 ||
+ dump_suffix() != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_cerr(uri, "close", tret);
+ if (ret == 0)
+ ret = tret;
+ }
+
+ return (ret);
+}
+
+/*
+ * dump_json_begin --
+ * Output the dump file header prefix.
+ */
+static int
+dump_json_begin(void)
+{
+ if (printf("{\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_json_end --
+ * Output the dump file header suffix.
+ */
+static int
+dump_json_end(void)
+{
+ if (printf("\n}\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_json_separator --
+ *	Output a separator between the dumps of multiple tables.
+ */
+static int
+dump_json_separator(void)
+{
+ if (printf(",\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_json_table_begin --
+ * Output the JSON syntax that starts a table, along with its config.
+ */
+static int
+dump_json_table_begin(WT_CURSOR *cursor, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+ const char *name;
+ char *jsonconfig;
+
+ jsonconfig = NULL;
+
+ /* Get the table name. */
+ if ((name = strchr(uri, ':')) == NULL) {
+ fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
+ return (1);
+ }
+ ++name;
+
+ if ((ret = dup_json_string(config, &jsonconfig)) != 0)
+ return (util_cerr(uri, "config dup", ret));
+ if (printf(" \"%s\" : [\n {\n", uri) < 0)
+ goto eio;
+ if (printf(" \"config\" : \"%s\",\n", jsonconfig) < 0)
+ goto eio;
+
+ if ((ret = dump_json_table_cg(
+ cursor, uri, name, "colgroup:", "colgroups")) == 0) {
+ if (printf(",\n") < 0)
+ goto eio;
+ ret =
+ dump_json_table_cg(cursor, uri, name, "index:", "indices");
+ }
+
+ if (printf("\n },\n {\n \"data\" : [") < 0)
+ goto eio;
+
+ if (0) {
+eio: ret = util_err(EIO, NULL);
+ }
+
+ free(jsonconfig);
+ return (ret);
+}
+
+/*
+ * dump_json_table_cg --
+ * Dump the column groups or indices for a table.
+ */
+static int
+dump_json_table_cg(WT_CURSOR *cursor,
+ const char *uri, const char *name, const char *entry, const char *header)
+{
+ WT_DECL_RET;
+ const char *key, *skip, *value;
+ int exact, once;
+ char *jsonconfig;
+ static const char * const indent = " ";
+
+ once = 0;
+ if (printf(" \"%s\" : [", header) < 0)
+ return (util_err(EIO, NULL));
+
+ /*
+ * For table dumps, we're done.
+ */
+ if (cursor == NULL) {
+ if (printf("]") < 0)
+ return (util_err(EIO, NULL));
+ else
+ return (0);
+ }
+
+ /*
+ * Search the file looking for column group and index key/value pairs:
+ * for each one, look up the related source information and append it
+ * to the base record.
+ */
+ cursor->set_key(cursor, entry);
+ if ((ret = cursor->search_near(cursor, &exact)) != 0) {
+ if (ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "search_near", ret));
+ }
+ if (exact >= 0)
+ goto match;
+ while ((ret = cursor->next(cursor)) == 0) {
+match: if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+
+ /* Check if we've finished the list of entries. */
+ if (!WT_PREFIX_MATCH(key, entry))
+ break;
+
+ /* Check for a table name match. */
+ skip = key + strlen(entry);
+ if (strncmp(
+ skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
+ continue;
+
+ /* Get the value. */
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ if ((ret = dup_json_string(value, &jsonconfig)) != 0)
+ return (util_cerr(uri, "config dup", ret));
+ ret = printf("%s\n"
+ "%s{\n"
+ "%s \"uri\" : \"%s\",\n"
+ "%s \"config\" : \"%s\"\n"
+ "%s}",
+ (once == 0 ? "" : ","),
+ indent, indent, key, indent, jsonconfig, indent);
+ free(jsonconfig);
+ if (ret < 0)
+ return (util_err(EIO, NULL));
+
+ once = 1;
+ }
+ if (printf("%s]", (once == 0 ? "" : "\n ")) < 0)
+ return (util_err(EIO, NULL));
+ if (ret == 0 || ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "next", ret));
+}
+
+/*
+ * dump_json_table_config --
+ * Dump the config for the uri.
+ */
+static int
+dump_json_table_config(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_EXTENSION_API *wtext;
+ int tret;
+ const char *value;
+
+ /* Dump the config. */
+ if (WT_PREFIX_MATCH(uri, "table:")) {
+ /* Open a metadata cursor. */
+ if ((ret = session->open_cursor(
+ session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+ progname, WT_METADATA_URI,
+ wiredtiger_strerror(ret));
+ return (1);
+ }
+
+ /*
+		 * Search for the object itself, to make sure it
+		 * exists, and get its config string. This is where we
+		 * find out a table object doesn't exist, so use a
+		 * simple error message in that case.
+ */
+ cursor->set_key(cursor, uri);
+ if ((ret = cursor->search(cursor)) == 0) {
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ ret = util_cerr(uri, "get_value", ret);
+ else if (dump_json_table_begin(cursor, uri,
+ value) != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_cerr(uri, "close", tret);
+ if (ret == 0)
+ ret = tret;
+ }
+ } else {
+ /*
+ * We want to be able to dump the metadata file itself, but the
+ * configuration for that file lives in the turtle file. Reach
+		 * down into the library and ask for the file's configuration;
+		 * that works in all cases.
+		 *
+		 * This is where we find out a file object doesn't exist, so
+		 * use a simple error message in that case.
+ */
+ wtext = session->
+ connection->get_extension_api(session->connection);
+ if ((ret =
+ wtext->metadata_search(wtext, session, uri, &value)) == 0) {
+ if (dump_json_table_begin(NULL, uri, value) != 0)
+ ret = 1;
+ } else if (ret == WT_NOTFOUND)
+ ret = util_err(0, "%s: No such object exists", uri);
+ else
+ ret = util_err(ret, "%s", uri);
+ }
+
+ return (ret);
+}
+
+/*
+ * dump_json_table_end --
+ * Output the JSON syntax that ends a table.
+ */
+static int
+dump_json_table_end(void)
+{
+ if (printf(" ]\n }\n ]") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
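+
+/*
+ * Taken together, the begin/cg/end functions above emit JSON shaped roughly
+ * like this (a sketch, with hypothetical names and configs):
+ *
+ *	"table:example" : [
+ *	    {
+ *	        "config" : "...",
+ *	        "colgroups" : [...],
+ *	        "indices" : [...]
+ *	    },
+ *	    {
+ *	        "data" : [ { ... }, { ... } ]
+ *	    }
+ *	]
+ */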
+
+/*
+ * dump_table_config --
+ * Dump the config for a table.
+ */
+static int
+dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
+{
+ WT_CURSOR *srch;
+ WT_DECL_RET;
+ int tret;
+ const char *key, *name, *value;
+
+ /* Get the table name. */
+ if ((name = strchr(uri, ':')) == NULL) {
+ fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
+ return (1);
+ }
+ ++name;
+
+ /*
+ * Dump out the config information: first, dump the uri entry itself
+ * (requires a lookup).
+ */
+ cursor->set_key(cursor, uri);
+ if ((ret = cursor->search(cursor)) != 0)
+ return (util_cerr(uri, "search", ret));
+ if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+ if (print_config(session, key, value, NULL) != 0)
+ return (1);
+
+ /*
+ * The underlying table configuration function needs a second cursor:
+ * open one before calling it, it makes error handling hugely simpler.
+ */
+ if ((ret =
+ session->open_cursor(session, NULL, cursor, NULL, &srch)) != 0)
+ return (util_cerr(uri, "open_cursor", ret));
+
+ if ((ret = dump_table_config_type(
+ session, cursor, srch, uri, name, "colgroup:")) == 0)
+ ret = dump_table_config_type(
+ session, cursor, srch, uri, name, "index:");
+
+ if ((tret = srch->close(srch)) != 0) {
+ tret = util_cerr(uri, "close", tret);
+ if (ret == 0)
+ ret = tret;
+ }
+
+ return (ret);
+}
+
+/*
+ * dump_table_config_type --
+ * Dump the column groups or indices for a table.
+ */
+static int
+dump_table_config_type(WT_SESSION *session,
+ WT_CURSOR *cursor, WT_CURSOR *srch,
+ const char *uri, const char *name, const char *entry)
+{
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ const char *key, *skip, *value, *value_source;
+ int exact;
+ char *p;
+
+ /*
+ * Search the file looking for column group and index key/value pairs:
+ * for each one, look up the related source information and append it
+ * to the base record.
+ */
+ cursor->set_key(cursor, entry);
+ if ((ret = cursor->search_near(cursor, &exact)) != 0) {
+ if (ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "search_near", ret));
+ }
+ if (exact >= 0)
+ goto match;
+ while ((ret = cursor->next(cursor)) == 0) {
+match: if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(uri, "get_key", ret));
+
+ /* Check if we've finished the list of entries. */
+ if (!WT_PREFIX_MATCH(key, entry))
+ return (0);
+
+ /* Check for a table name match. */
+ skip = key + strlen(entry);
+ if (strncmp(
+ skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
+ continue;
+
+ /* Get the value. */
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ /* Crack it and get the underlying source. */
+ if ((ret = __wt_config_getones(
+ (WT_SESSION_IMPL *)session, value, "source", &cval)) != 0)
+ return (util_err(ret, "%s: source entry", key));
+
+ /* Nul-terminate the source entry. */
+ if ((p = malloc(cval.len + 10)) == NULL)
+ return (util_err(errno, NULL));
+ (void)strncpy(p, cval.str, cval.len);
+ p[cval.len] = '\0';
+ srch->set_key(srch, p);
+ if ((ret = srch->search(srch)) != 0)
+ ret = util_err(ret, "%s: %s", key, p);
+ free(p);
+ if (ret != 0)
+ return (1);
+
+ /* Get the source's value. */
+ if ((ret = srch->get_value(srch, &value_source)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+
+ /*
+ * The dumped configuration string is the original key plus the
+ * source's configuration.
+ */
+ if (print_config(session, key, value, value_source) != 0)
+ return (util_err(EIO, NULL));
+ }
+ if (ret == 0 || ret == WT_NOTFOUND)
+ return (0);
+ return (util_cerr(uri, "next", ret));
+}
+
+/*
+ * dump_prefix --
+ * Output the dump file header prefix.
+ */
+static int
+dump_prefix(int hex)
+{
+ int vmajor, vminor, vpatch;
+
+ (void)wiredtiger_version(&vmajor, &vminor, &vpatch);
+
+ if (printf(
+ "WiredTiger Dump (WiredTiger Version %d.%d.%d)\n",
+ vmajor, vminor, vpatch) < 0 ||
+ printf("Format=%s\n", hex ? "hex" : "print") < 0 ||
+ printf("Header\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+/*
+ * dump_record --
+ *	Dump the records, advancing the cursor as it goes, with JSON
+ *	formatting if requested.
+ */
+static int
+dump_record(WT_CURSOR *cursor, const char *name, int reverse, int json)
+{
+ WT_DECL_RET;
+ const char *infix, *key, *prefix, *suffix, *value;
+ int once;
+
+ once = 0;
+ if (json) {
+ prefix = "\n{\n";
+ infix = ",\n";
+ suffix = "\n}";
+ } else {
+ prefix = "";
+ infix = "\n";
+ suffix = "\n";
+ }
+ while ((ret =
+ (reverse ? cursor->prev(cursor) : cursor->next(cursor))) == 0) {
+ if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr(name, "get_key", ret));
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(name, "get_value", ret));
+ if (printf("%s%s%s%s%s%s", (json && once) ? "," : "",
+ prefix, key, infix, value, suffix) < 0)
+ return (util_err(EIO, NULL));
+ once = 1;
+ }
+ if (json && once && printf("\n") < 0)
+ return (util_err(EIO, NULL));
+ return (ret == WT_NOTFOUND ? 0 :
+ util_cerr(name, (reverse ? "prev" : "next"), ret));
+}
+
+/*
+ * dump_suffix --
+ * Output the dump file header suffix.
+ */
+static int
+dump_suffix(void)
+{
+ if (printf("Data\n") < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
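+
+/*
+ * Together, dump_prefix, the config output and dump_suffix produce a header
+ * like the following (the version and names are hypothetical):
+ *
+ *	WiredTiger Dump (WiredTiger Version 2.0.0)
+ *	Format=print
+ *	Header
+ *	table:example
+ *	key_format=S,value_format=S,...
+ *	Data
+ *
+ * after which key and value lines alternate until the end of the dump.
+ */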
+
+/*
+ * dup_json_string --
+ * Like strdup, but escape any characters that are special for JSON.
+ * The result will be embedded in a JSON string.
+ */
+static int
+dup_json_string(const char *str, char **result)
+{
+ size_t left, nchars;
+ const char *p;
+ char *q;
+
+ nchars = 0;
+ for (p = str; *p; p++, nchars++)
+ nchars += __wt_json_unpack_char(*p, NULL, 0, 0);
+ q = malloc(nchars + 1);
+ if (q == NULL)
+ return (1);
+ *result = q;
+ left = nchars;
+ for (p = str; *p; p++, nchars++) {
+ nchars = __wt_json_unpack_char(*p, (u_char *)q, left, 0);
+ left -= nchars;
+ q += nchars;
+ }
+ *q = '\0';
+ return (0);
+}
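+
+/*
+ * For example, assuming __wt_json_unpack_char escapes quotes and control
+ * characters, an input string such as:
+ *
+ *	say "hi"
+ *
+ * duplicates as:
+ *
+ *	say \"hi\"
+ *
+ * which can then be embedded in a JSON string without further quoting.
+ */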
+
+/*
+ * print_config --
+ * Output a key/value URI pair by combining v1 and v2.
+ */
+static int
+print_config(WT_SESSION *session,
+ const char *key, const char *v1, const char *v2)
+{
+ WT_DECL_RET;
+ const char *value_ret;
+
+ /*
+ * The underlying call will ignore v2 if v1 is NULL -- check here and
+ * swap in that case.
+ */
+ if (v1 == NULL) {
+ v1 = v2;
+ v2 = NULL;
+ }
+
+ if ((ret = __wt_session_create_strip(session, v1, v2, &value_ret)) != 0)
+ return (util_err(ret, NULL));
+ ret = printf("%s\n%s\n", key, value_ret);
+ free((char *)value_ret);
+ if (ret < 0)
+ return (util_err(EIO, NULL));
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "dump [-jrx] [-c checkpoint] [-f output-file] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_list.c b/src/third_party/wiredtiger/src/utilities/util_list.c
new file mode 100644
index 00000000000..4a1489628d1
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_list.c
@@ -0,0 +1,193 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int list_print(WT_SESSION *, const char *, int, int);
+static int list_print_checkpoint(WT_SESSION *, const char *);
+static int usage(void);
+
+int
+util_list(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int cflag, ch, vflag;
+ char *name;
+
+ cflag = vflag = 0;
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF)
+ switch (ch) {
+ case 'c':
+ cflag = 1;
+ break;
+ case 'v':
+ vflag = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ switch (argc) {
+ case 0:
+ break;
+ case 1:
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+ break;
+ default:
+ return (usage());
+ }
+
+ ret = list_print(session, name, cflag, vflag);
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+/*
+ * list_print --
+ * List the high-level objects in the database.
+ */
+static int
+list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int found;
+ const char *key, *value;
+
+ /* Open the metadata file. */
+ if ((ret = session->open_cursor(
+ session, WT_METADATA_URI, NULL, NULL, &cursor)) != 0) {
+ /*
+ * If there is no metadata (yet), this will return ENOENT.
+ * Treat that the same as an empty metadata.
+ */
+ if (ret == ENOENT)
+ return (0);
+
+ fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
+ progname, WT_METADATA_URI, wiredtiger_strerror(ret));
+ return (1);
+ }
+
+ found = name == NULL;
+ while ((ret = cursor->next(cursor)) == 0) {
+ /* Get the key. */
+ if ((ret = cursor->get_key(cursor, &key)) != 0)
+ return (util_cerr("metadata", "get_key", ret));
+
+ /*
+ * If a name is specified, only show objects that match.
+ */
+ if (name != NULL) {
+ if (!WT_PREFIX_MATCH(key, name))
+ continue;
+ found = 1;
+ }
+
+ /*
+ * XXX
+ * We don't normally say anything about the WiredTiger
+ * metadata, it's not a normal "object" in the database. I'm
+ * making an exception for the checkpoint and verbose options.
+ */
+ if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag)
+ printf("%s\n", key);
+
+ if (!cflag && !vflag)
+ continue;
+
+ if (cflag && (ret = list_print_checkpoint(session, key)) != 0)
+ return (ret);
+ if (vflag) {
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (
+ util_cerr("metadata", "get_value", ret));
+ printf("%s\n", value);
+ }
+ }
+ if (ret != WT_NOTFOUND)
+ return (util_cerr("metadata", "next", ret));
+ if (!found) {
+ fprintf(stderr, "%s: %s: not found\n", progname, name);
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * list_print_checkpoint --
+ * List the checkpoint information.
+ */
+static int
+list_print_checkpoint(WT_SESSION *session, const char *key)
+{
+ WT_DECL_RET;
+ WT_CKPT *ckpt, *ckptbase;
+ size_t len;
+ time_t t;
+ uint64_t v;
+
+ /*
+ * We may not find any checkpoints for this file, in which case we don't
+ * report an error, and continue our caller's loop. Otherwise, read the
+ * list of checkpoints and print each checkpoint's name and time.
+ */
+ if ((ret = __wt_metadata_get_ckptlist(session, key, &ckptbase)) != 0)
+ return (ret == WT_NOTFOUND ? 0 : ret);
+
+ /* Find the longest name, so we can pretty-print. */
+ len = 0;
+ WT_CKPT_FOREACH(ckptbase, ckpt)
+ if (strlen(ckpt->name) > len)
+ len = strlen(ckpt->name);
+ ++len;
+
+ WT_CKPT_FOREACH(ckptbase, ckpt) {
+ /*
+ * Call ctime, not ctime_r; ctime_r has portability problems,
+ * the Solaris version is different from the POSIX standard.
+ */
+ t = (time_t)ckpt->sec;
+ printf("\t%*s: %.24s", (int)len, ckpt->name, ctime(&t));
+
+ v = ckpt->ckpt_size;
+ if (v >= WT_PETABYTE)
+ printf(" (%" PRIu64 " PB)\n", v / WT_PETABYTE);
+ else if (v >= WT_TERABYTE)
+ printf(" (%" PRIu64 " TB)\n", v / WT_TERABYTE);
+ else if (v >= WT_GIGABYTE)
+ printf(" (%" PRIu64 " GB)\n", v / WT_GIGABYTE);
+ else if (v >= WT_MEGABYTE)
+ printf(" (%" PRIu64 " MB)\n", v / WT_MEGABYTE);
+ else if (v >= WT_KILOBYTE)
+ printf(" (%" PRIu64 " KB)\n", v / WT_KILOBYTE);
+ else
+ printf(" (%" PRIu64 " B)\n", v);
+ }
+
+ __wt_metadata_free_ckptlist(session, ckptbase);
+ return (0);
+}
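+
+/*
+ * Sample output for a (hypothetical) checkpoint of a 12MB file:
+ *
+ *	      snapshot-1: Thu Apr  3 09:00:00 2014 (12 MB)
+ */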
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "list [-cv] [uri]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c
new file mode 100644
index 00000000000..7d9dfa445dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load.c
@@ -0,0 +1,595 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+#include "util_load.h"
+
+static int config_read(char ***, int *);
+static int config_rename(char **, const char *);
+static void config_remove(char *, const char *);
+static int format(void);
+static int insert(WT_CURSOR *, const char *);
+static int load_dump(WT_SESSION *);
+static int usage(void);
+
+static int append; /* -a append (ignore record number keys) */
+static char *cmdname; /* -r rename */
+static char **cmdconfig; /* configuration pairs */
+static int json; /* -j input is JSON format */
+static int no_overwrite; /* -n don't overwrite existing data */
+
+int
+util_load(WT_SESSION *session, int argc, char *argv[])
+{
+ int ch;
+ const char *filename;
+ uint32_t flags;
+
+ flags = 0;
+
+ filename = "<stdin>";
+ while ((ch = __wt_getopt(progname, argc, argv, "af:jnr:")) != EOF)
+ switch (ch) {
+ case 'a': /* append (ignore record number keys) */
+ append = 1;
+ break;
+ case 'f': /* input file */
+ if (freopen(__wt_optarg, "r", stdin) == NULL)
+ return (
+ util_err(errno, "%s: reopen", __wt_optarg));
+ else
+ filename = __wt_optarg;
+ break;
+ case 'j': /* input is JSON */
+ json = 1;
+ break;
+ case 'n': /* don't overwrite existing data */
+ no_overwrite = 1;
+ break;
+ case 'r': /* rename */
+ cmdname = __wt_optarg;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+	/* -a and -n are mutually exclusive. */
+ if (append == 1 && no_overwrite == 1)
+ return (util_err(EINVAL,
+ "the -a (append) and -n (no-overwrite) flags are mutually "
+ "exclusive"));
+
+ /* The remaining arguments are configuration uri/string pairs. */
+ if (argc != 0) {
+ if (argc % 2 != 0)
+ return (usage());
+ cmdconfig = argv;
+ }
+
+ if (json) {
+ if (append)
+ flags |= LOAD_JSON_APPEND;
+ if (no_overwrite)
+ flags |= LOAD_JSON_NO_OVERWRITE;
+ return (util_load_json(session, filename, flags));
+ } else
+ return (load_dump(session));
+}
+
+/*
+ * load_dump --
+ * Load from the WiredTiger dump format.
+ */
+static int
+load_dump(WT_SESSION *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int hex, tret;
+ char **list, **tlist, *uri, config[64];
+
+ cursor = NULL;
+ list = NULL; /* -Wuninitialized */
+ hex = 0; /* -Wuninitialized */
+ uri = NULL;
+
+ /* Read the metadata file. */
+ if ((ret = config_read(&list, &hex)) != 0)
+ return (ret);
+
+ /* Reorder and check the list. */
+ if ((ret = config_reorder(list)) != 0)
+ goto err;
+
+ /* Update the config based on any command-line configuration. */
+ if ((ret = config_update(session, list)) != 0)
+ goto err;
+
+ uri = list[0];
+ /* Create the items in the list. */
+ if ((ret = config_exec(session, list)) != 0)
+ goto err;
+
+ /* Open the insert cursor. */
+ (void)snprintf(config, sizeof(config),
+ "dump=%s%s%s",
+ hex ? "hex" : "print",
+ append ? ",append" : "", no_overwrite ? ",overwrite=false" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0) {
+ ret = util_err(ret, "%s: session.open", uri);
+ goto err;
+ }
+
+ /*
+ * Check the append flag (it only applies to objects where the primary
+ * key is a record number).
+ */
+ if (append && strcmp(cursor->key_format, "r") != 0) {
+ fprintf(stderr,
+ "%s: %s: -a option illegal unless the primary key is a "
+ "record number\n",
+ progname, uri);
+ ret = 1;
+ } else
+ ret = insert(cursor, uri);
+
+err: /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but I'd like to see the flush to disk and
+ * the close succeed, it's better to fail early when loading files.
+ */
+ if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0)
+ ret = tret;
+ }
+ if (ret == 0)
+ ret = util_flush(session, uri);
+
+ for (tlist = list; *tlist != NULL; ++tlist)
+ free(*tlist);
+ free(list);
+
+ return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * config_exec --
+ * Create the tables/indices/colgroups implied by the list.
+ */
+int
+config_exec(WT_SESSION *session, char **list)
+{
+ WT_DECL_RET;
+
+ for (; *list != NULL; list += 2)
+ if ((ret = session->create(session, list[0], list[1])) != 0)
+ return (util_err(ret, "%s: session.create", list[0]));
+ return (0);
+}
+
+/*
+ * config_list_add --
+ * Add a value to the config list.
+ */
+int
+config_list_add(CONFIG_LIST *clp, char *val)
+{
+ if (clp->entry + 1 >= clp->max_entry)
+ if ((clp->list = realloc(clp->list, (size_t)
+ (clp->max_entry += 100) * sizeof(char *))) == NULL)
+			/*
+			 * The old list is leaked if realloc fails; the
+			 * error is fatal to the load anyway.
+			 */
+ return (util_err(errno, NULL));
+
+ clp->list[clp->entry++] = val;
+ clp->list[clp->entry] = NULL;
+ return (0);
+}
+
+/*
+ * config_list_free --
+ * Free the list and any of its entries.
+ */
+void
+config_list_free(CONFIG_LIST *clp)
+{
+ char **entry;
+
+ if (clp->list != NULL)
+ for (entry = &clp->list[0]; *entry != NULL; entry++)
+ free(*entry);
+ free(clp->list);
+ clp->list = NULL;
+}
+
+/*
+ * config_read --
+ * Read the config lines and do some basic validation.
+ */
+static int
+config_read(char ***listp, int *hexp)
+{
+ ULINE l;
+ WT_DECL_RET;
+ int entry, eof, max_entry;
+ const char *s;
+ char **list, **tlist;
+
+ list = NULL;
+ memset(&l, 0, sizeof(l));
+
+ /* Header line #1: "WiredTiger Dump" and a WiredTiger version. */
+ if (util_read_line(&l, 0, &eof))
+ return (1);
+ s = "WiredTiger Dump ";
+ if (strncmp(l.mem, s, strlen(s)) != 0)
+ return (format());
+
+ /* Header line #2: "Format={hex,print}". */
+ if (util_read_line(&l, 0, &eof))
+ return (1);
+ if (strcmp(l.mem, "Format=print") == 0)
+ *hexp = 0;
+ else if (strcmp(l.mem, "Format=hex") == 0)
+ *hexp = 1;
+ else
+ return (format());
+
+ /* Header line #3: "Header". */
+ if (util_read_line(&l, 0, &eof))
+ return (1);
+ if (strcmp(l.mem, "Header") != 0)
+ return (format());
+
+ /* Now, read in lines until we get to the end of the headers. */
+ for (entry = max_entry = 0, list = NULL;; ++entry) {
+ if ((ret = util_read_line(&l, 0, &eof)) != 0)
+ goto err;
+ if (strcmp(l.mem, "Data") == 0)
+ break;
+
+ /*
+ * Grow the array of header lines as necessary -- we need an
+ * extra slot for NULL termination.
+ */
+ if (entry + 1 >= max_entry) {
+ if ((tlist = realloc(list, (size_t)
+ (max_entry += 100) * sizeof(char *))) == NULL) {
+ ret = util_err(errno, NULL);
+
+				/*
+				 * Realloc doesn't free its argument on
+				 * failure; the err label frees whatever
+				 * entries the original list holds.
+				 */
+ goto err;
+ }
+ list = tlist;
+ }
+ if ((list[entry] = strdup(l.mem)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ list[entry + 1] = NULL;
+ }
+
+ /* Headers are required, and they're supposed to be in pairs. */
+ if (list == NULL || entry % 2 != 0) {
+ ret = format();
+ goto err;
+ }
+ *listp = list;
+ return (0);
+
+err: if (list != NULL) {
+ for (tlist = list; *tlist != NULL; ++tlist)
+ free(*tlist);
+ free(list);
+ }
+ return (ret);
+}
+
+/*
+ * config_reorder --
+ * For table dumps, reorder the list so tables are first.
+ * For other dumps, make any needed checks.
+ */
+int
+config_reorder(char **list)
+{
+ char **entry, *p;
+
+ /*
+ * Search for a table name -- if we find one, then it's table dump,
+ * otherwise, it's a single file dump.
+ */
+ for (entry = list; *entry != NULL; ++entry)
+ if (WT_PREFIX_MATCH(*entry, "table:"))
+ break;
+ if (*entry == NULL) {
+ /*
+ * Single file dumps can only have two lines, the file name and
+ * the configuration information.
+ */
+ if ((list[0] == NULL || list[1] == NULL || list[2] != NULL) ||
+		    (!WT_PREFIX_MATCH(list[0], "file:") &&
+		    !WT_PREFIX_MATCH(list[0], "lsm:")))
+ return (format());
+
+ entry = list;
+ }
+
+ /*
+ * Make sure the table key/value pair comes first, then we can just
+ * run through the array in order. (We already checked that we had
+ * a multiple of 2 entries, so this is safe.)
+ */
+ if (entry != list) {
+ p = list[0]; list[0] = entry[0]; entry[0] = p;
+ p = list[1]; list[1] = entry[1]; entry[1] = p;
+ }
+ return (0);
+}
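+
+/*
+ * For example, a dump of a (hypothetical) table with one column group lists
+ * its pairs in metadata order:
+ *
+ *	colgroup:example:main <config> table:example <config>
+ *
+ * and the swap above moves the table pair to the front, so config_exec can
+ * create the table before its column group.
+ */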
+
+/*
+ * config_update --
+ * Reconcile and update the command line configuration against the
+ * config we found.
+ */
+int
+config_update(WT_SESSION *session, char **list)
+{
+ int found;
+ const char *cfg[] = { NULL, NULL, NULL };
+ char **configp, **listp;
+ const char **rm;
+ static const char *rmnames[] = {
+ "filename", "id", "checkpoint", "checkpoint_lsn",
+ "version", "source", NULL };
+
+ /*
+ * If the object has been renamed, replace all of the column group,
+ * index, file and table names with the new name.
+ */
+ if (cmdname != NULL) {
+ for (listp = list; *listp != NULL; listp += 2)
+ if (WT_PREFIX_MATCH(*listp, "colgroup:") ||
+ WT_PREFIX_MATCH(*listp, "file:") ||
+ WT_PREFIX_MATCH(*listp, "index:") ||
+ WT_PREFIX_MATCH(*listp, "table:"))
+ if (config_rename(listp, cmdname))
+ return (1);
+
+ /*
+ * If the object was renamed, and there are configuration pairs,
+ * rename the configuration pairs as well, because we don't know
+ * if the user used the old or new names for the pair's URI.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2)
+ if (config_rename(configp, cmdname))
+ return (1);
+ }
+
+ /*
+ * Remove all "filename=", "source=" and other configurations
+ * that foil loading from the values. New filenames are chosen
+ * as part of table load.
+ */
+ for (listp = list; *listp != NULL; listp += 2)
+ for (rm = rmnames; *rm != NULL; rm++)
+ if (strstr(listp[1], *rm) != NULL)
+ config_remove(listp[1], *rm);
+
+ /*
+ * It's possible to update everything except the key/value formats.
+ * If there were command-line configuration pairs, walk the list of
+ * command-line configuration strings, and check.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2)
+ if (strstr(configp[1], "key_format=") ||
+ strstr(configp[1], "value_format="))
+ return (util_err(0,
+ "the command line configuration string may not "
+ "modify the object's key or value format"));
+
+ /*
+ * If there were command-line configuration pairs, walk the list of
+ * command-line URIs and find a matching dump URI. For each match,
+ * rewrite the dump configuration as described by the command-line
+ * configuration. It is an error if a command-line URI doesn't find
+ * a single, exact match, that's likely a mistake.
+ */
+ for (configp = cmdconfig;
+ cmdconfig != NULL && *configp != NULL; configp += 2) {
+ found = 0;
+ for (listp = list; *listp != NULL; listp += 2) {
+ if (strncmp(*configp, listp[0], strlen(*configp)) != 0)
+ continue;
+ /*
+ * !!!
+ * We support JSON configuration strings, which leads to
+ * configuration strings with brackets. Unfortunately,
+ * that implies we can't simply append new configuration
+ * strings to existing ones. We call an unpublished
+ * WiredTiger API to do the concatenation: if anyone
+ * else ever needs it we can make it public, but I think
+ * that's unlikely. We're also playing fast and loose
+ * with types, but it should work.
+ */
+ cfg[0] = listp[1];
+ cfg[1] = configp[1];
+ if (__wt_config_concat(
+ (WT_SESSION_IMPL *)session, cfg,
+ (const char **)&listp[1]) != 0)
+ return (1);
+ ++found;
+ }
+ switch (found) {
+ case 0:
+ return (util_err(0,
+ "the command line object name %s was not matched "
+ "by any loaded object name", *configp));
+ case 1:
+ break;
+ default:
+ return (util_err(0,
+ "the command line object name %s was not unique, "
+ "matching more than a single loaded object name",
+ *configp));
+ }
+ }
+
+ /* Leak the memory, I don't care. */
+ return (0);
+}
+
+/*
+ * config_rename --
+ * Update the URI name.
+ */
+static int
+config_rename(char **urip, const char *name)
+{
+ size_t len;
+ char *buf, *p;
+
+ /* Allocate room. */
+ len = strlen(*urip) + strlen(name) + 10;
+ if ((buf = malloc(len)) == NULL)
+ return (util_err(errno, NULL));
+
+ /*
+	 * Find the separating colon characters, but note the trailing one
+	 * may not be there.
+ */
+ if ((p = strchr(*urip, ':')) == NULL) {
+ free(buf);
+ return (format());
+ }
+ *p = '\0';
+ p = strchr(p + 1, ':');
+ snprintf(buf, len, "%s:%s%s", *urip, name, p == NULL ? "" : p);
+ *urip = buf;
+
+ return (0);
+}
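+
+/*
+ * The rename keeps everything from the second colon on, so with the
+ * (hypothetical) new name "newtab":
+ *
+ *	table:oldtab          becomes  table:newtab
+ *	colgroup:oldtab:main  becomes  colgroup:newtab:main
+ */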
+
+/*
+ * config_remove --
+ * Remove a single config key and its value.
+ */
+static void
+config_remove(char *config, const char *ckey)
+{
+ int parens, quoted;
+ char *begin, match[100], *next, *p;
+
+ snprintf(match, sizeof(match), "%s=", ckey);
+ if ((begin = strstr(config, match)) != NULL) {
+ parens = 0;
+ quoted = 0;
+ next = NULL;
+ for (p = begin + strlen(match); !next && *p; p++)
+ switch (*p) {
+ case '(':
+ if (!quoted)
+ parens++;
+ break;
+ case ')':
+ if (!quoted)
+ parens--;
+ break;
+ case '"':
+ quoted = !quoted;
+ break;
+ case ',':
+ if (!quoted && parens == 0)
+ next = p + 1;
+ break;
+ }
+ if (next)
+ memmove(begin, next, strlen(next) + 1);
+ else
+ *begin = '\0';
+ }
+}
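+
+/*
+ * The scan above honors parentheses and quoting when looking for the comma
+ * that ends a value: removing (hypothetically) "filename" from:
+ *
+ *	filename=ex.wt,key_format=S
+ *
+ * leaves "key_format=S", while a parenthesized value such as
+ * "checkpoint=(name=(...))" is consumed as a unit.
+ */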
+
+/*
+ * format --
+ * The input doesn't match the dump format.
+ */
+static int
+format(void)
+{
+ return (util_err(0, "input does not match WiredTiger dump format"));
+}
+
+/*
+ * insert --
+ * Read and insert data.
+ */
+static int
+insert(WT_CURSOR *cursor, const char *name)
+{
+ ULINE key, value;
+ WT_DECL_RET;
+ uint64_t insert_count;
+ int eof;
+
+ memset(&key, 0, sizeof(key));
+ memset(&value, 0, sizeof(value));
+
+ /* Read key/value pairs and insert them into the file. */
+ for (insert_count = 0;;) {
+ /*
+ * Three modes: in row-store, we always read a key and use it,
+ * in column-store, we might read it (a dump), we might read
+ * and ignore it (a dump with "append" set), or not read it at
+ * all (flat-text load).
+ */
+ if (util_read_line(&key, 1, &eof))
+ return (1);
+ if (eof == 1)
+ break;
+ if (!append)
+ cursor->set_key(cursor, key.mem);
+
+ if (util_read_line(&value, 0, &eof))
+ return (1);
+ cursor->set_value(cursor, value.mem);
+
+ if ((ret = cursor->insert(cursor)) != 0)
+ return (util_err(ret, "%s: cursor.insert", name));
+
+ /* Report on progress every 100 inserts. */
+ if (verbose && ++insert_count % 100 == 0) {
+ printf("\r\t%s: %" PRIu64, name, insert_count);
+ fflush(stdout);
+ }
+ }
+
+ if (verbose)
+ printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
+
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+	    "load [-ajn] [-f input-file] [-r name] [object configuration ...]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_load.h b/src/third_party/wiredtiger/src/utilities/util_load.h
new file mode 100644
index 00000000000..7bca677e178
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * A list of configuration strings.
+ */
+typedef struct {
+ char **list; /* array of alternating (uri, config) values */
+ int entry; /* next entry available in list */
+ int max_entry; /* how many allocated in list */
+} CONFIG_LIST;
+
+int config_exec(WT_SESSION *, char **);
+int config_list_add(CONFIG_LIST *, char *);
+void config_list_free(CONFIG_LIST *);
+int config_reorder(char **);
+int config_update(WT_SESSION *, char **);
+
+/* Flags for util_load_json */
+#define LOAD_JSON_APPEND 0x0001 /* append (ignore record number keys) */
+#define LOAD_JSON_NO_OVERWRITE 0x0002 /* don't overwrite existing data */
+
+int util_load_json(WT_SESSION *, const char *, uint32_t);
diff --git a/src/third_party/wiredtiger/src/utilities/util_load_json.c b/src/third_party/wiredtiger/src/utilities/util_load_json.c
new file mode 100644
index 00000000000..fb61df9ab16
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_load_json.c
@@ -0,0 +1,573 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+#include "util_load.h"
+
+/*
+ * Encapsulates the input state for parsing JSON.
+ *
+ * At any time, we may be peeking at an unconsumed token; this is
+ * indicated by 'peeking' as true. toktype, tokstart, toklen will be
+ * set in this case.
+ *
+ * Generally we are collecting and processing tokens one by one.
+ * In JSON, tokens never span lines so this makes processing easy.
+ * The exception is that a JSON dump cursor takes the complete
+ * set of keys or values during cursor->set_key/set_value calls,
+ * which may contain many tokens and span lines. E.g.
+ * cursor->set_value("\"name\" : \"John\", \"phone\" : 2348765");
+ * The raw key/value string is collected in the kvraw field.
+ */
+typedef struct {
+ WT_SESSION *session; /* associated session */
+ ULINE line; /* current line */
+ const char *p; /* points to cur position in line.mem */
+ int ateof; /* current token is EOF */
+ int peeking; /* peeking at next token */
+ int toktype; /* next token, defined by __wt_json_token() */
+ const char *tokstart; /* next token start (points into line.mem) */
+ size_t toklen; /* next token length */
+ char *kvraw; /* multiple line raw content collected so far */
+ size_t kvrawstart; /* pos on cur line that JSON key/value starts */
+ const char *filename; /* filename for error reporting */
+ int linenum; /* line number for error reporting */
+} JSON_INPUT_STATE;
+
+static int json_column_group_index(WT_SESSION *, JSON_INPUT_STATE *,
+ CONFIG_LIST *, int);
+static int json_data(WT_SESSION *, JSON_INPUT_STATE *, CONFIG_LIST *, uint32_t);
+static int json_expect(WT_SESSION *, JSON_INPUT_STATE *, int);
+static int json_peek(WT_SESSION *, JSON_INPUT_STATE *);
+static int json_skip(WT_SESSION *, JSON_INPUT_STATE *, const char **);
+static int json_kvraw_append(JSON_INPUT_STATE *, const char *, size_t);
+static int json_strdup(JSON_INPUT_STATE *, char **);
+static int json_top_level(WT_SESSION *, JSON_INPUT_STATE *, uint32_t);
+
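+/* Match the current quoted string token against a literal, quotes excluded. */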
+#define JSON_STRING_MATCH(ins, match) \
+ ((ins)->toklen - 2 == strlen(match) && \
+ strncmp((ins)->tokstart + 1, (match), (ins)->toklen - 2) == 0)
+
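+/* Byte offset of the current position within the current input line. */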
+#define JSON_INPUT_POS(ins) \
+ ((size_t)((ins)->p - (const char *)(ins)->line.mem))
+
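+/* Consume a token of the wanted type, branching to the local err label. */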
+#define JSON_EXPECT(session, ins, tok) do { \
+ if (json_expect(session, ins, tok)) \
+ goto err; \
+} while (0)
+
+/*
+ * json_column_group_index --
+ * Parse a column group or index entry from JSON input.
+ */
+static int
+json_column_group_index(WT_SESSION *session, JSON_INPUT_STATE *ins,
+ CONFIG_LIST *clp, int idx)
+{
+ WT_DECL_RET;
+ char *config, *p, *uri;
+ int isconfig;
+
+ uri = NULL;
+ config = NULL;
+
+ while (json_peek(session, ins) == '{') {
+ JSON_EXPECT(session, ins, '{');
+ JSON_EXPECT(session, ins, 's');
+ isconfig = JSON_STRING_MATCH(ins, "config");
+ if (!isconfig && !JSON_STRING_MATCH(ins, "uri"))
+ goto err;
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+
+ if ((ret = json_strdup(ins, &p)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if (isconfig)
+ config = p;
+ else
+ uri = p;
+
+ isconfig = !isconfig;
+ JSON_EXPECT(session, ins, ',');
+ JSON_EXPECT(session, ins, 's');
+ if (!JSON_STRING_MATCH(ins, isconfig ? "config" : "uri"))
+ goto err;
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+
+ if ((ret = json_strdup(ins, &p)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if (isconfig)
+ config = p;
+ else
+ uri = p;
+ JSON_EXPECT(session, ins, '}');
+ if ((idx && strncmp(uri, "index:", 6) != 0) ||
+ (!idx && strncmp(uri, "colgroup:", 9) != 0)) {
+ ret = util_err(EINVAL,
+ "%s: misplaced colgroup or index", uri);
+ goto err;
+ }
+ if ((ret = config_list_add(clp, uri)) != 0 ||
+ (ret = config_list_add(clp, config)) != 0)
+ goto err;
+
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != '{')
+ goto err;
+ }
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ return (ret);
+}
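+
+/*
+ * Each entry parsed above has the shape (illustrative values):
+ *	{ "uri" : "colgroup:main:cg1", "config" : "columns=(v1,v2)" }
+ * with the "uri" and "config" members accepted in either order.
+ */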
+
+/*
+ * json_kvraw_append --
+ * Append to the kvraw buffer, which is used to collect all the
+ * raw key/value pairs from JSON input.
+ */
+static int
+json_kvraw_append(JSON_INPUT_STATE *ins, const char *str, size_t len)
+{
+ char *tmp;
+ size_t needsize;
+
+ if (len > 0) {
+ needsize = strlen(ins->kvraw) + len + 2;
+ if ((tmp = malloc(needsize)) == NULL)
+ return (util_err(errno, NULL));
+ snprintf(tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str);
+ free(ins->kvraw);
+ ins->kvraw = tmp;
+ }
+ return (0);
+}
+
+/*
+ * json_strdup --
+ * Return a string, with no escapes or other JSON-isms, from the
+ * JSON string at the current input position.
+ */
+static int
+json_strdup(JSON_INPUT_STATE *ins, char **resultp)
+{
+ WT_DECL_RET;
+ char *result, *resultcpy;
+ const char *src;
+ ssize_t resultlen;
+ size_t srclen;
+
+ result = NULL;
+ src = ins->tokstart + 1; /* strip the quotes from the token */
+ srclen = ins->toklen - 2;
+ if ((resultlen = __wt_json_strlen(src, srclen)) < 0) {
+ ret = util_err(EINVAL, "Invalid config string");
+ goto err;
+ }
+ resultlen += 1;
+ if ((result = (char *)malloc((size_t)resultlen)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ *resultp = result;
+ resultcpy = result;
+ if ((ret = __wt_json_strncpy(
+     &resultcpy, (size_t)resultlen, src, srclen)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ if (result != NULL)
+ free(result);
+ *resultp = NULL;
+ }
+ return (ret);
+}
+
+/*
+ * json_data --
+ * Parse the data portion of the JSON input, and insert all
+ * values.
+ */
+static int
+json_data(WT_SESSION *session, JSON_INPUT_STATE *ins, CONFIG_LIST *clp,
+ uint32_t flags)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ char config[64], *endp, *uri;
+ const char *keyformat;
+ int isrec, nfield, nkeys, toktype, tret;
+ size_t keystrlen;
+ ssize_t gotnolen;
+ uint64_t gotno, recno;
+
+ cursor = NULL;
+ uri = NULL;
+
+ /* Reorder and check the list. */
+ if ((ret = config_reorder(clp->list)) != 0)
+ goto err;
+
+ /* Update config based on command-line configuration. */
+ if ((ret = config_update(session, clp->list)) != 0)
+ goto err;
+
+ /* Create the items collected. */
+ if ((ret = config_exec(session, clp->list)) != 0)
+ goto err;
+
+ uri = clp->list[0];
+ (void)snprintf(config, sizeof(config),
+ "dump=json%s%s",
+ LF_ISSET(LOAD_JSON_APPEND) ? ",append" : "",
+ LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0) {
+ ret = util_err(ret, "%s: session.open", uri);
+ goto err;
+ }
+ keyformat = cursor->key_format;
+ isrec = (strcmp(keyformat, "r") == 0);
+ for (nkeys = 0; *keyformat; keyformat++)
+ if (!isdigit(*keyformat))
+ nkeys++;
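+ /*
+ * E.g., a key format of "Si" yields nkeys == 2; digits only
+ * qualify the following type and add no columns (illustrative).
+ */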
+
+ recno = 0;
+ while (json_peek(session, ins) == '{') {
+ nfield = 0;
+ JSON_EXPECT(session, ins, '{');
+ if (ins->kvraw == NULL) {
+ if ((ins->kvraw = (char *)malloc(1)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ }
+ ins->kvraw[0] = '\0';
+ ins->kvrawstart = JSON_INPUT_POS(ins);
+ keystrlen = 0;
+ while (json_peek(session, ins) == 's') {
+ JSON_EXPECT(session, ins, 's');
+ JSON_EXPECT(session, ins, ':');
+ toktype = json_peek(session, ins);
+ JSON_EXPECT(session, ins, toktype);
+ if (isrec && nfield == 0) {
+ /* Verify the dump has recnos in order. */
+ recno++;
+ gotno = __wt_strtouq(ins->tokstart, &endp, 0);
+ gotnolen = (endp - ins->tokstart);
+ if (recno != gotno ||
+ ins->toklen != (size_t)gotnolen) {
+ ret = util_err(0,
+ "%s: recno out of order", uri);
+ goto err;
+ }
+ }
+ if (++nfield == nkeys) {
+ size_t curpos = JSON_INPUT_POS(ins);
+ if ((ret = json_kvraw_append(ins,
+ (char *)ins->line.mem + ins->kvrawstart,
+ curpos - ins->kvrawstart)) != 0)
+ goto err;
+ ins->kvrawstart = curpos;
+ keystrlen = strlen(ins->kvraw);
+ }
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != 's')
+ goto err;
+ }
+ if (json_kvraw_append(ins, ins->line.mem, JSON_INPUT_POS(ins)))
+ goto err;
+
+ ins->kvraw[keystrlen] = '\0';
+ if (!LF_ISSET(LOAD_JSON_APPEND))
+ cursor->set_key(cursor, ins->kvraw);
+ /* Skip over the inserted space and comma. */
+ cursor->set_value(cursor, &ins->kvraw[keystrlen + 2]);
+ if ((ret = cursor->insert(cursor)) != 0) {
+ ret = util_err(ret, "%s: cursor.insert", uri);
+ goto err;
+ }
+
+ JSON_EXPECT(session, ins, '}');
+ if (json_peek(session, ins) != ',')
+ break;
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != '{')
+ goto err;
+ }
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but we'd like to see the flush to disk and
+ * the close succeed; it's better to fail early when loading files.
+ */
+ if (cursor != NULL && (tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0)
+ ret = tret;
+ }
+ if (ret == 0)
+ ret = util_flush(session, uri);
+ return (ret);
+}
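+
+/*
+ * The "data" array consumed above is shaped like (illustrative,
+ * abbreviated):
+ *	"data" : [
+ *		{
+ *			"key0" : "abc",
+ *			"value0" : "xyz"
+ *		},
+ *		...
+ *	]
+ */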
+
+/*
+ * json_top_level --
+ * Parse the top level JSON input.
+ */
+static int
+json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags)
+{
+ CONFIG_LIST cl;
+ WT_DECL_RET;
+ char *config, *tableuri;
+ int toktype;
+ static const char *json_markers[] = {
+ "\"config\"", "\"colgroups\"", "\"indices\"", "\"data\"", NULL };
+
+ memset(&cl, 0, sizeof(cl));
+ tableuri = NULL;
+ JSON_EXPECT(session, ins, '{');
+ while (json_peek(session, ins) == 's') {
+ JSON_EXPECT(session, ins, 's');
+ /* Check the allocation; we exit via err on failure. */
+ if ((tableuri = realloc(tableuri, ins->toklen)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ snprintf(tableuri, ins->toklen, "%.*s",
+ (int)(ins->toklen - 2), ins->tokstart + 1);
+ JSON_EXPECT(session, ins, ':');
+
+ /*
+ * Allow any ordering of 'config', 'colgroups' and
+ * 'indices' before 'data', which must appear last.
+ * The non-'data' items build up a list of entries
+ * that are created in our session before the data
+ * is inserted.
+ */
+ for (;;) {
+ if (json_skip(session, ins, json_markers) != 0)
+ goto err;
+ JSON_EXPECT(session, ins, 's');
+ if (JSON_STRING_MATCH(ins, "config")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, 's');
+ if ((ret = json_strdup(ins, &config)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if ((ret = config_list_add(&cl, tableuri)) != 0)
+ goto err;
+ if ((ret = config_list_add(&cl, config)) != 0)
+ goto err;
+ tableuri = NULL;
+ } else if (JSON_STRING_MATCH(ins, "colgroups")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, '[');
+ if ((ret = json_column_group_index(
+ session, ins, &cl, 0)) != 0)
+ goto err;
+ JSON_EXPECT(session, ins, ']');
+ } else if (JSON_STRING_MATCH(ins, "indices")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, '[');
+ if ((ret = json_column_group_index(
+ session, ins, &cl, 1)) != 0)
+ goto err;
+ JSON_EXPECT(session, ins, ']');
+ } else if (JSON_STRING_MATCH(ins, "data")) {
+ JSON_EXPECT(session, ins, ':');
+ JSON_EXPECT(session, ins, '[');
+ if ((ret = json_data(session, ins, &cl,
+ flags)) != 0)
+ goto err;
+ config_list_free(&cl);
+ break;
+ } else
+ goto err;
+ }
+
+ while ((toktype = json_peek(session, ins)) == '}' ||
+ toktype == ']')
+ JSON_EXPECT(session, ins, toktype);
+ if (toktype == 0) /* Check EOF. */
+ break;
+ if (toktype == ',') {
+ JSON_EXPECT(session, ins, ',');
+ if (json_peek(session, ins) != 's')
+ goto err;
+ continue;
+ }
+ }
+ JSON_EXPECT(session, ins, 0);
+
+ if (0) {
+err: if (ret == 0)
+ ret = EINVAL;
+ }
+ config_list_free(&cl);
+ if (tableuri != NULL)
+ free(tableuri);
+ return (ret);
+}
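+
+/*
+ * The top-level input parsed above is shaped like (illustrative,
+ * abbreviated; see the output of 'wt dump -j'):
+ *	{ "table:mytable" : {
+ *		"config" : "key_format=S,...",
+ *		"colgroups" : [ ... ],
+ *		"indices" : [ ... ],
+ *		"data" : [ ... ]
+ *	} }
+ */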
+
+/*
+ * json_peek --
+ * Set the input state to the next available token in the input
+ * and return its token type, a code defined by __wt_json_token().
+ */
+static int
+json_peek(WT_SESSION *session, JSON_INPUT_STATE *ins)
+{
+ WT_DECL_RET;
+
+ if (!ins->peeking) {
+ while (!ins->ateof) {
+ while (isspace(*ins->p))
+ ins->p++;
+ if (*ins->p)
+ break;
+ if (ins->kvraw != NULL) {
+ if (json_kvraw_append(ins,
+ (char *)ins->line.mem + ins->kvrawstart,
+ strlen(ins->line.mem) - ins->kvrawstart)) {
+ ret = -1;
+ goto err;
+ }
+ ins->kvrawstart = 0;
+ }
+ if (util_read_line(&ins->line, 1,
+ &ins->ateof)) {
+ ins->toktype = -1;
+ ret = -1;
+ goto err;
+ }
+ ins->linenum++;
+ ins->p = (const char *)ins->line.mem;
+ }
+ if (ins->ateof)
+ ins->toktype = 0;
+ else if (__wt_json_token(session, ins->p,
+ &ins->toktype, &ins->tokstart,
+ &ins->toklen) != 0)
+ ins->toktype = -1;
+ ins->peeking = 1;
+ }
+ if (0) {
+err: if (ret == 0)
+ ret = -1;
+ }
+ return (ret == 0 ? ins->toktype : -1);
+}
+
+/*
+ * json_expect --
+ * Ensure that the type of the next token in the input matches
+ * the wanted value, and advance past it. The values of the
+ * input state will be set so specific string or integer values
+ * can be pulled out after this call.
+ */
+static int
+json_expect(WT_SESSION *session, JSON_INPUT_STATE *ins, int wanttok)
+{
+ if (json_peek(session, ins) < 0)
+ return (1);
+ ins->p += ins->toklen;
+ ins->peeking = 0;
+ if (ins->toktype != wanttok) {
+ fprintf(stderr,
+ "%s: %d: %" WT_SIZET_FMT ": expected %s, got %s\n",
+ ins->filename,
+ ins->linenum,
+ JSON_INPUT_POS(ins) + 1,
+ __wt_json_tokname(wanttok),
+ __wt_json_tokname(ins->toktype));
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * json_skip --
+ * Skip over JSON input until one of the specified strings appears.
+ * The tokenizer will be set to point to the beginning of
+ * that string.
+ */
+static int
+json_skip(WT_SESSION *session, JSON_INPUT_STATE *ins, const char **matches)
+{
+ const char *hit;
+ const char **match;
+
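+ /* Skipping input is incompatible with collecting raw key/value text. */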
+ if (ins->kvraw != NULL)
+ return (1);
+
+ hit = NULL;
+ while (!ins->ateof) {
+ for (match = matches; *match != NULL; match++)
+ if ((hit = strstr(ins->p, *match)) != NULL)
+ goto out;
+ if (util_read_line(&ins->line, 1, &ins->ateof)) {
+ ins->toktype = -1;
+ return (1);
+ }
+ ins->linenum++;
+ ins->p = (const char *)ins->line.mem;
+ }
+out:
+ if (hit == NULL)
+ return (1);
+
+ /* Set to this token. */
+ ins->p = hit;
+ ins->peeking = 0;
+ ins->toktype = 0;
+ (void)json_peek(session, ins);
+ return (0);
+}
+
+/*
+ * util_load_json --
+ * Load from the JSON format produced by 'wt dump -j'.
+ */
+int
+util_load_json(WT_SESSION *session, const char *filename, uint32_t flags)
+{
+ JSON_INPUT_STATE instate;
+ WT_DECL_RET;
+
+ memset(&instate, 0, sizeof(instate));
+ instate.session = session;
+ if (util_read_line(&instate.line, 0, &instate.ateof))
+ return (1);
+ instate.p = (const char *)instate.line.mem;
+ instate.linenum = 1;
+ instate.filename = filename;
+
+ if ((ret = json_top_level(session, &instate, flags)) != 0)
+ goto err;
+
+err: if (instate.line.mem != NULL)
+ free(instate.line.mem);
+ free(instate.kvraw);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_loadtext.c b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
new file mode 100644
index 00000000000..27c4c23b50c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_loadtext.c
@@ -0,0 +1,157 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int insert(WT_CURSOR *, const char *, int);
+static int text(WT_SESSION *, const char *);
+static int usage(void);
+
+int
+util_loadtext(WT_SESSION *session, int argc, char *argv[])
+{
+ int ch;
+ const char *uri;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF)
+ switch (ch) {
+ case 'f': /* input file */
+ if (freopen(__wt_optarg, "r", stdin) == NULL)
+ return (
+ util_err(errno, "%s: reopen", __wt_optarg));
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the uri. */
+ if (argc != 1)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ return (text(session, uri));
+}
+
+/*
+ * text --
+ * Load flat-text into a file/table.
+ */
+static int
+text(WT_SESSION *session, const char *uri)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ int readkey, tret;
+
+ /*
+ * Open the cursor, configured to append new records (in the case of
+ * column-store objects), or to overwrite existing strings (in the
+ * case of row-store objects). The two flags are mutually exclusive,
+ * but the library doesn't currently care that we set both of them.
+ */
+ if ((ret = session->open_cursor(
+ session, uri, NULL, "append,overwrite", &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * We're about to load strings, make sure the formats match.
+ *
+ * Row-store tables have key/value pairs, column-store tables only have
+ * values.
+ */
+ if (strcmp(cursor->value_format, "S") != 0 ||
+ (strcmp(cursor->key_format, "S") != 0 &&
+ strcmp(cursor->key_format, "r") != 0))
+ return (util_err(EINVAL,
+ "the loadtext command can only load objects configured "
+ "for record number or string keys, and string values"));
+ readkey = strcmp(cursor->key_format, "r") == 0 ? 0 : 1;
+
+ /* Insert the records */
+ ret = insert(cursor, uri, readkey);
+
+ /*
+ * Technically, we don't have to close the cursor because the session
+ * handle will do it for us, but we'd like to see the flush to disk and
+ * the close succeed; it's better to fail early when loading files.
+ */
+ if ((tret = cursor->close(cursor)) != 0) {
+ tret = util_err(tret, "%s: cursor.close", uri);
+ if (ret == 0)
+ ret = tret;
+ }
+ if (ret == 0)
+ ret = util_flush(session, uri);
+
+ return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * insert --
+ * Read and insert data.
+ */
+static int
+insert(WT_CURSOR *cursor, const char *name, int readkey)
+{
+ ULINE key, value;
+ WT_DECL_RET;
+ uint64_t insert_count;
+ int eof;
+
+ memset(&key, 0, sizeof(key));
+ memset(&value, 0, sizeof(value));
+
+ /* Read key/value pairs and insert them into the file. */
+ for (insert_count = 0;;) {
+ /*
+ * Three modes: in row-store, we always read a key and use it,
+ * in column-store, we might read it (a dump), we might read
+ * and ignore it (a dump with "append" set), or not read it at
+ * all (flat-text load).
+ */
+ if (readkey) {
+ if (util_read_line(&key, 1, &eof))
+ return (1);
+ if (eof == 1)
+ break;
+ cursor->set_key(cursor, key.mem);
+ }
+ if (util_read_line(&value, readkey ? 0 : 1, &eof))
+ return (1);
+ if (eof == 1)
+ break;
+ cursor->set_value(cursor, value.mem);
+
+ if ((ret = cursor->insert(cursor)) != 0)
+ return (util_err(ret, "%s: cursor.insert", name));
+
+ /* Report on progress every 100 inserts. */
+ if (verbose && ++insert_count % 100 == 0) {
+ printf("\r\t%s: %" PRIu64, name, insert_count);
+ fflush(stdout);
+ }
+ }
+
+ if (verbose)
+ printf("\r\t%s: %" PRIu64 "\n", name, insert_count);
+
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "loadtext [-f input-file] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_main.c b/src/third_party/wiredtiger/src/utilities/util_main.c
new file mode 100644
index 00000000000..04ab59f1ca9
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_main.c
@@ -0,0 +1,262 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+const char *home = "."; /* Home directory */
+const char *progname; /* Program name */
+ /* Global arguments */
+const char *usage_prefix = "[-Vv] [-C config] [-h home]";
+int verbose; /* Verbose flag */
+
+static const char *command; /* Command name */
+
+static int usage(void);
+
+int
+main(int argc, char *argv[])
+{
+ WT_CONNECTION *conn;
+ WT_DECL_RET;
+ WT_SESSION *session;
+ size_t len;
+ int ch, major_v, minor_v, tret, (*func)(WT_SESSION *, int, char *[]);
+ char *p;
+ const char *cmd_config, *config;
+
+ conn = NULL;
+ p = NULL;
+
+ /* Get the program name. */
+ if ((progname = strrchr(argv[0], '/')) == NULL)
+ progname = argv[0];
+ else
+ ++progname;
+ command = "";
+
+ /* Check the version against the library build. */
+ (void)wiredtiger_version(&major_v, &minor_v, NULL);
+ if (major_v != WIREDTIGER_VERSION_MAJOR ||
+ minor_v != WIREDTIGER_VERSION_MINOR) {
+ fprintf(stderr,
+ "%s: program build version %d.%d does not match "
+ "library build version %d.%d\n",
+ progname,
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR,
+ major_v, minor_v);
+ return (EXIT_FAILURE);
+ }
+
+ /* Check for standard options. */
+ cmd_config = config = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "C:h:Vv")) != EOF)
+ switch (ch) {
+ case 'C': /* wiredtiger_open config */
+ cmd_config = __wt_optarg;
+ break;
+ case 'h': /* home directory */
+ home = __wt_optarg;
+ break;
+ case 'V': /* version */
+ printf("%s\n", wiredtiger_version(NULL, NULL, NULL));
+ return (EXIT_SUCCESS);
+ case 'v': /* verbose */
+ verbose = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The next argument is the command name. */
+ if (argc < 1)
+ return (usage());
+ command = argv[0];
+
+ /* Reset getopt. */
+ __wt_optreset = __wt_optind = 1;
+
+ func = NULL;
+ switch (command[0]) {
+ case 'b':
+ if (strcmp(command, "backup") == 0)
+ func = util_backup;
+ break;
+ case 'c':
+ if (strcmp(command, "compact") == 0)
+ func = util_compact;
+ else if (strcmp(command, "copyright") == 0) {
+ util_copyright();
+ return (EXIT_SUCCESS);
+ } else if (strcmp(command, "create") == 0) {
+ func = util_create;
+ config = "create";
+ }
+ break;
+ case 'd':
+ if (strcmp(command, "drop") == 0)
+ func = util_drop;
+ else if (strcmp(command, "dump") == 0)
+ func = util_dump;
+ break;
+ case 'l':
+ if (strcmp(command, "list") == 0)
+ func = util_list;
+ else if (strcmp(command, "load") == 0) {
+ func = util_load;
+ config = "create";
+ } else if (strcmp(command, "loadtext") == 0) {
+ func = util_loadtext;
+ config = "create";
+ }
+ break;
+ case 'p':
+ if (strcmp(command, "printlog") == 0)
+ func = util_printlog;
+ break;
+ case 'r':
+ if (strcmp(command, "read") == 0)
+ func = util_read;
+ else if (strcmp(command, "rename") == 0)
+ func = util_rename;
+ break;
+ case 's':
+ if (strcmp(command, "salvage") == 0)
+ func = util_salvage;
+ else if (strcmp(command, "stat") == 0) {
+ func = util_stat;
+ config = "statistics=(all)";
+ }
+ break;
+ case 'u':
+ if (strcmp(command, "upgrade") == 0)
+ func = util_upgrade;
+ break;
+ case 'v':
+ if (strcmp(command, "verify") == 0)
+ func = util_verify;
+ break;
+ case 'w':
+ if (strcmp(command, "write") == 0)
+ func = util_write;
+ break;
+ default:
+ break;
+ }
+ if (func == NULL)
+ return (usage());
+
+ /* Build the configuration string, as necessary. */
+ if (config == NULL)
+ config = cmd_config;
+ else if (cmd_config != NULL) {
+ len = strlen(cmd_config) + strlen(config) + 10;
+ if ((p = malloc(len)) == NULL) {
+ ret = util_err(errno, NULL);
+ goto err;
+ }
+ (void)snprintf(p, len, "%s,%s", config, cmd_config);
+ config = p;
+ }
+
+ /* Open the database and a session. */
+ if ((ret = wiredtiger_open(home,
+ verbose ? verbose_handler : NULL, config, &conn)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+ if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+ ret = util_err(ret, NULL);
+ goto err;
+ }
+
+ /* Call the function. */
+ ret = func(session, argc, argv);
+
+ /* Close the database. */
+
+err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0)
+ ret = tret;
+
+ if (p != NULL)
+ free(p);
+
+ return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+static int
+usage(void)
+{
+ fprintf(stderr,
+ "WiredTiger Data Engine (version %d.%d)\n",
+ WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR);
+ fprintf(stderr,
+ "global options:\n"
+ "\t" "-C\twiredtiger_open configuration\n"
+ "\t" "-h\tdatabase directory\n"
+ "\t" "-V\tdisplay library version and exit\n"
+ "\t" "-v\tverbose\n");
+ fprintf(stderr,
+ "commands:\n"
+ "\t" "backup\t database backup\n"
+ "\t" "compact\t compact an object\n"
+ "\t" "copyright copyright information\n"
+ "\t" "create\t create an object\n"
+ "\t" "drop\t drop an object\n"
+ "\t" "dump\t dump an object\n"
+ "\t" "list\t list database objects\n"
+ "\t" "load\t load an object\n"
+ "\t" "loadtext\t load an object from a text file\n"
+ "\t" "printlog display the database log\n"
+ "\t" "read\t read values from an object\n"
+ "\t" "rename\t rename an object\n"
+ "\t" "salvage\t salvage a file\n"
+ "\t" "stat\t display statistics for an object\n"
+ "\t" "upgrade\t upgrade an object\n"
+ "\t" "verify\t verify an object\n"
+ "\t" "write\t write values to an object\n");
+
+ return (EXIT_FAILURE);
+}
+
+/*
+ * util_name --
+ * Build a name.
+ */
+char *
+util_name(const char *s, const char *type)
+{
+ size_t len;
+ char *name;
+
+ if (WT_PREFIX_MATCH(s, "backup:") ||
+ WT_PREFIX_MATCH(s, "config:") ||
+ WT_PREFIX_MATCH(s, "statistics:")) {
+ fprintf(stderr,
+ "%s: %s: unsupported object type: %s\n",
+ progname, command, s);
+ return (NULL);
+ }
+
+ len = strlen(type) + strlen(s) + 2;
+ if ((name = calloc(len, 1)) == NULL) {
+ (void)util_err(errno, NULL);
+ return (NULL);
+ }
+
+ /*
+ * If the string has a URI prefix, use it verbatim, otherwise prepend
+ * the default type for the operation.
+ */
+ if (strchr(s, ':') != NULL)
+ strcpy(name, s);
+ else
+ snprintf(name, len, "%s:%s", type, s);
+ return (name);
+}
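+
+/*
+ * For example, util_name("foo", "table") returns "table:foo", while an
+ * argument that already includes a prefix, e.g. "lsm:foo", is copied
+ * verbatim.
+ */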
diff --git a/src/third_party/wiredtiger/src/utilities/util_misc.c b/src/third_party/wiredtiger/src/utilities/util_misc.c
new file mode 100644
index 00000000000..71e307a2e0e
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_misc.c
@@ -0,0 +1,146 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+int
+util_cerr(const char *uri, const char *op, int ret)
+{
+ return (util_err(ret, "%s: cursor.%s", uri, op));
+}
+
+/*
+ * util_err --
+ * Report an error.
+ */
+int
+util_err(int e, const char *fmt, ...)
+{
+ va_list ap;
+
+ (void)fprintf(stderr, "%s: ", progname);
+ if (fmt != NULL) {
+ va_start(ap, fmt);
+ (void)vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ if (e != 0)
+ (void)fprintf(stderr, ": ");
+ }
+ if (e != 0)
+ (void)fprintf(stderr, "%s", wiredtiger_strerror(e));
+ (void)fprintf(stderr, "\n");
+ return (1);
+}
+
+/*
+ * util_read_line --
+ * Read a line from stdin into a ULINE.
+ */
+int
+util_read_line(ULINE *l, int eof_expected, int *eofp)
+{
+ static uint64_t line = 0;
+ size_t len;
+ int ch;
+
+ ++line;
+ *eofp = 0;
+
+ if (l->memsize == 0) {
+ if ((l->mem = realloc(l->mem, l->memsize + 1024)) == NULL)
+ return (util_err(errno, NULL));
+ l->memsize = 1024;
+ }
+ for (len = 0;; ++len) {
+ if ((ch = getchar()) == EOF) {
+ if (len == 0) {
+ if (eof_expected) {
+ *eofp = 1;
+ return (0);
+ }
+ return (util_err(0,
+ "line %" PRIu64 ": unexpected end-of-file",
+ line));
+ }
+ return (util_err(0,
+ "line %" PRIu64 ": no newline terminator", line));
+ }
+ if (ch == '\n')
+ break;
+ /*
+ * We nul-terminate the string so it's easier to convert the
+ * line into a record number; that means we always need one
+ * extra byte at the end.
+ */
+ if (len >= l->memsize - 1) {
+ if ((l->mem =
+ realloc(l->mem, l->memsize + 1024)) == NULL)
+ return (util_err(errno, NULL));
+ l->memsize += 1024;
+ }
+ ((uint8_t *)l->mem)[len] = (uint8_t)ch;
+ }
+
+ ((uint8_t *)l->mem)[len] = '\0'; /* nul-terminate */
+
+ return (0);
+}
+
+/*
+ * util_str2recno --
+ * Convert a string to a record number.
+ */
+int
+util_str2recno(const char *p, uint64_t *recnop)
+{
+ uint64_t recno;
+ char *endptr;
+
+ /*
+ * strtouq accepts lots of things like hex values, signs and so on,
+ * none of which are OK with us. Checking that the string starts
+ * with a digit turns off most of that special processing.
+ */
+ if (!isdigit(p[0]))
+ goto format;
+
+ errno = 0;
+ recno = __wt_strtouq(p, &endptr, 0);
+ if (recno == ULLONG_MAX && errno == ERANGE)
+ return (util_err(ERANGE, "%s: invalid record number", p));
+
+ if (endptr[0] != '\0')
+format: return (util_err(EINVAL, "%s: invalid record number", p));
+
+ *recnop = recno;
+ return (0);
+}
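+
+/*
+ * For example, "42" converts to record number 42, while "-1" or " 42"
+ * fail the leading-digit check (examples illustrative).
+ */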
+
+/*
+ * util_flush --
+ * Flush the file successfully, or drop it.
+ */
+int
+util_flush(WT_SESSION *session, const char *uri)
+{
+ WT_DECL_RET;
+ size_t len;
+ char *buf;
+
+ len = strlen(uri) + 100;
+ if ((buf = malloc(len)) == NULL)
+ return (util_err(errno, NULL));
+
+ (void)snprintf(buf, len, "target=(\"%s\")", uri);
+ if ((ret = session->checkpoint(session, buf)) != 0) {
+ ret = util_err(ret, "%s: session.checkpoint", uri);
+ (void)session->drop(session, uri, NULL);
+ }
+
+ free(buf);
+ return (ret);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_printlog.c b/src/third_party/wiredtiger/src/utilities/util_printlog.c
new file mode 100644
index 00000000000..7fc9bfa39b0
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_printlog.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_printlog(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch, printable;
+
+ printable = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "f:p")) != EOF)
+ switch (ch) {
+ case 'f': /* output file */
+ if (freopen(__wt_optarg, "w", stdout) == NULL) {
+ fprintf(stderr, "%s: %s: reopen: %s\n",
+ progname, __wt_optarg, strerror(errno));
+ return (1);
+ }
+ break;
+ case 'p':
+ printable = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* There should not be any more arguments. */
+ if (argc != 0)
+ return (usage());
+
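+ /* The -p (printable) option is parsed but not yet acted on. */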
+ WT_UNUSED(printable);
+ ret = __wt_txn_printlog(session, stdout);
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: printlog failed: %s\n",
+ progname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "printlog [-p] [-f output-file]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_read.c b/src/third_party/wiredtiger/src/utilities/util_read.c
new file mode 100644
index 00000000000..d9a629e40e2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_read.c
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_read(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint64_t recno;
+ int ch, rkey, rval;
+ const char *uri, *value;
+
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining arguments are a uri followed by a list of keys. */
+ if (argc < 2)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Open the object. */
+ if ((ret = session->open_cursor(
+ session, uri, NULL, NULL, &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * A simple search only makes sense if the key format is a string or a
+ * record number, and the value format is a single string.
+ */
+ if (strcmp(cursor->key_format, "r") != 0 &&
+ strcmp(cursor->key_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: read command only possible when the key format is "
+ "a record number or string\n",
+ progname);
+ return (1);
+ }
+ rkey = strcmp(cursor->key_format, "r") == 0 ? 1 : 0;
+ if (strcmp(cursor->value_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: read command only possible when the value format is "
+ "a string\n",
+ progname);
+ return (1);
+ }
+
+ /*
+ * Run through the keys, returning non-zero on error or if any requested
+ * key isn't found.
+ */
+ for (rval = 0; *++argv != NULL;) {
+ if (rkey) {
+ if (util_str2recno(*argv, &recno))
+ return (1);
+ cursor->set_key(cursor, recno);
+ } else
+ cursor->set_key(cursor, *argv);
+
+ switch (ret = cursor->search(cursor)) {
+ case 0:
+ if ((ret = cursor->get_value(cursor, &value)) != 0)
+ return (util_cerr(uri, "get_value", ret));
+ if (printf("%s\n", value) < 0)
+ return (util_err(EIO, NULL));
+ break;
+ case WT_NOTFOUND:
+ (void)util_err(0, "%s: not found", *argv);
+ rval = 1;
+ break;
+ default:
+ return (util_cerr(uri, "search", ret));
+ }
+ }
+
+ return (rval);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "read uri key ...\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_rename.c b/src/third_party/wiredtiger/src/utilities/util_rename.c
new file mode 100644
index 00000000000..8c2aeb30c59
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_rename.c
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_rename(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *uri, *newuri;
+
+ uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining arguments are the object uri and new name. */
+ if (argc != 2)
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+ newuri = argv[1];
+
+ if ((ret = session->rename(session, uri, newuri, NULL)) != 0) {
+ fprintf(stderr, "%s: rename %s to %s: %s\n",
+ progname, uri, newuri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (uri != NULL)
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "rename uri newuri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_salvage.c b/src/third_party/wiredtiger/src/utilities/util_salvage.c
new file mode 100644
index 00000000000..386365d8875
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_salvage.c
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_salvage(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ const char *force;
+ char *name;
+
+ force = NULL;
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF)
+ switch (ch) {
+ case 'F':
+ force = "force";
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the file name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "file")) == NULL)
+ return (1);
+
+ if ((ret = session->salvage(session, name, force)) != 0) {
+ fprintf(stderr, "%s: salvage(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "salvage [-F] uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_stat.c b/src/third_party/wiredtiger/src/utilities/util_stat.c
new file mode 100644
index 00000000000..caac560e839
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_stat.c
@@ -0,0 +1,103 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_stat(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ size_t urilen;
+ int all, ch, objname_free;
+ const char *pval, *desc;
+ char *objname, *uri;
+
+ all = objname_free = 0;
+ objname = uri = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "a")) != EOF)
+ switch (ch) {
+ case 'a':
+ all = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /*
+ * If there are no arguments, the statistics cursor operates on the
+ * connection, otherwise, the optional remaining argument is a file
+ * or LSM name.
+ */
+ switch (argc) {
+ case 0:
+ objname = (char *)"";
+ break;
+ case 1:
+ if ((objname = util_name(*argv, "table")) == NULL)
+ return (1);
+ objname_free = 1;
+ break;
+ default:
+ return (usage());
+ }
+
+ urilen = strlen("statistics:") + strlen(objname) + 1;
+ if ((uri = calloc(urilen, 1)) == NULL) {
+ fprintf(stderr, "%s: %s\n", progname, strerror(errno));
+ goto err;
+ }
+ snprintf(uri, urilen, "statistics:%s", objname);
+
+ if ((ret = session->open_cursor(session, uri, NULL,
+ all ? "statistics=(all)" : NULL, &cursor)) != 0) {
+ fprintf(stderr, "%s: cursor open(%s) failed: %s\n",
+ progname, uri, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* List the statistics. */
+ while (
+ (ret = cursor->next(cursor)) == 0 &&
+ (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0)
+ if (printf("%s=%s\n", desc, pval) < 0) {
+ ret = errno;
+ break;
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+
+ if (ret != 0) {
+ fprintf(stderr, "%s: cursor get(%s) failed: %s\n",
+ progname, objname, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ if (0) {
+err: ret = 1;
+ }
+ if (objname_free)
+ free(objname);
+ free(uri);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "stat -a [uri]\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_upgrade.c b/src/third_party/wiredtiger/src/utilities/util_upgrade.c
new file mode 100644
index 00000000000..b56caca2ccd
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_upgrade.c
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_upgrade(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ int ch;
+ char *name;
+
+ name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF)
+ switch (ch) {
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ if ((ret = session->upgrade(session, name, NULL)) != 0) {
+ fprintf(stderr, "%s: upgrade(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "upgrade uri\n",
+ progname, usage_prefix);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_verbose.c b/src/third_party/wiredtiger/src/utilities/util_verbose.c
new file mode 100644
index 00000000000..12ff1c5463c
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_verbose.c
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+/*
+ * __handle_error_verbose --
+ * Verbose WT_EVENT_HANDLER->handle_error implementation: send to stderr.
+ */
+static int
+__handle_error_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, int error, const char *errmsg)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+ WT_UNUSED(error);
+
+ return (fprintf(stderr, "%s\n", errmsg) < 0 ? EIO : 0);
+}
+
+/*
+ * __handle_message_verbose --
+ * Verbose WT_EVENT_HANDLER->handle_message implementation: send to stdout.
+ */
+static int
+__handle_message_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *message)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (printf("%s\n", message) < 0 ? EIO : 0);
+}
+
+/*
+ * __handle_progress_verbose --
+ * Default WT_EVENT_HANDLER->handle_progress implementation: ignore.
+ */
+static int
+__handle_progress_verbose(WT_EVENT_HANDLER *handler,
+ WT_SESSION *session, const char *operation, uint64_t progress)
+{
+ WT_UNUSED(handler);
+ WT_UNUSED(session);
+
+ return (
+ printf("\r\t%s %-20" PRIu64, operation, progress) < 0 ? EIO : 0);
+}
+
+static WT_EVENT_HANDLER __event_handler_verbose = {
+ __handle_error_verbose,
+ __handle_message_verbose,
+ __handle_progress_verbose,
+ NULL /* Close handler. */
+};
+
+WT_EVENT_HANDLER *verbose_handler = &__event_handler_verbose;
diff --git a/src/third_party/wiredtiger/src/utilities/util_verify.c b/src/third_party/wiredtiger/src/utilities/util_verify.c
new file mode 100644
index 00000000000..6ae5fdeec26
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_verify.c
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+#undef OPT_ARGS
+#undef USAGE_ARGS
+#ifdef HAVE_DIAGNOSTIC
+#define OPT_ARGS "d:"
+#define USAGE_ARGS \
+ "[-d dump_address | dump_blocks | dump_offsets=#,# | dump_pages] uri"
+#else
+#define OPT_ARGS ""
+#define USAGE_ARGS "uri"
+#endif
+
+int
+util_verify(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_DECL_RET;
+ size_t size;
+ int ch, dump_address, dump_blocks, dump_pages;
+ char *config, *dump_offsets, *name;
+
+ dump_address = dump_blocks = dump_pages = 0;
+ config = dump_offsets = name = NULL;
+ while ((ch = __wt_getopt(progname, argc, argv, OPT_ARGS)) != EOF)
+ switch (ch) {
+ case 'd':
+ if (strcmp(__wt_optarg, "dump_address") == 0)
+ dump_address = 1;
+ else if (strcmp(__wt_optarg, "dump_blocks") == 0)
+ dump_blocks = 1;
+ else if (
+ WT_PREFIX_MATCH(__wt_optarg, "dump_offsets=")) {
+ if (dump_offsets != NULL) {
+ fprintf(stderr,
+ "%s: only a single 'dump_offsets' "
+ "argument supported\n", progname);
+ return (usage());
+ }
+ dump_offsets =
+ __wt_optarg + strlen("dump_offsets=");
+ } else if (strcmp(__wt_optarg, "dump_pages") == 0)
+ dump_pages = 1;
+ else
+ return (usage());
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /* The remaining argument is the table name. */
+ if (argc != 1)
+ return (usage());
+ if ((name = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Build the configuration string as necessary. */
+ if (dump_address || dump_blocks || dump_offsets != NULL || dump_pages) {
+ size =
+ strlen("dump_address,") +
+ strlen("dump_blocks,") +
+ strlen("dump_pages,") +
+ strlen("dump_offsets[],") +
+ (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20;
+ if ((config = malloc(size)) == NULL) {
+ (void)util_err(errno, NULL);
+ goto err;
+ }
+ snprintf(config, size,
+ "%s%s%s%s%s%s",
+ dump_address ? "dump_address," : "",
+ dump_blocks ? "dump_blocks," : "",
+ dump_offsets != NULL ? "dump_offsets=[" : "",
+ dump_offsets != NULL ? dump_offsets : "",
+ dump_offsets != NULL ? "]," : "",
+ dump_pages ? "dump_pages" : "");
+ }
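+ /*
+ * E.g., "-d dump_address -d dump_offsets=1024,2048" builds the
+ * string "dump_address,dump_offsets=[1024,2048]," (illustrative).
+ */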
+ if ((ret = session->verify(session, name, config)) != 0) {
+ fprintf(stderr, "%s: verify(%s): %s\n",
+ progname, name, wiredtiger_strerror(ret));
+ goto err;
+ }
+
+ /* Verbose configures a progress counter, move to the next line. */
+ if (verbose)
+ printf("\n");
+
+ if (0) {
+err: ret = 1;
+ }
+
+ if (config != NULL)
+ free(config);
+ if (name != NULL)
+ free(name);
+
+ return (ret);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "verify %s\n",
+ progname, usage_prefix, USAGE_ARGS);
+ return (1);
+}
diff --git a/src/third_party/wiredtiger/src/utilities/util_write.c b/src/third_party/wiredtiger/src/utilities/util_write.c
new file mode 100644
index 00000000000..067b951c0cc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/utilities/util_write.c
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "util.h"
+
+static int usage(void);
+
+int
+util_write(WT_SESSION *session, int argc, char *argv[])
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ uint64_t recno;
+ int append, ch, overwrite, rkey;
+ const char *uri;
+ char config[100];
+
+ append = overwrite = 0;
+ while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF)
+ switch (ch) {
+ case 'a':
+ append = 1;
+ break;
+ case 'o':
+ overwrite = 1;
+ break;
+ case '?':
+ default:
+ return (usage());
+ }
+ argc -= __wt_optind;
+ argv += __wt_optind;
+
+ /*
+ * The remaining arguments are a uri followed by a list of values (if
+ * append is set), or key/value pairs (if append is not set).
+ */
+ if (append) {
+ if (argc < 2)
+ return (usage());
+ } else
+ if (argc < 3 || ((argc - 1) % 2 != 0))
+ return (usage());
+ if ((uri = util_name(*argv, "table")) == NULL)
+ return (1);
+
+ /* Open the object. */
+ (void)snprintf(config, sizeof(config), "%s,%s",
+ append ? "append=true" : "", overwrite ? "overwrite=true" : "");
+ if ((ret = session->open_cursor(
+ session, uri, NULL, config, &cursor)) != 0)
+ return (util_err(ret, "%s: session.open", uri));
+
+ /*
+ * A simple search only makes sense if the key format is a string or a
+ * record number, and the value format is a single string.
+ */
+ if (strcmp(cursor->key_format, "r") != 0 &&
+ strcmp(cursor->key_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: write command only possible when the key format is "
+ "a record number or string\n",
+ progname);
+ return (1);
+ }
+ rkey = strcmp(cursor->key_format, "r") == 0 ? 1 : 0;
+ if (strcmp(cursor->value_format, "S") != 0) {
+ fprintf(stderr,
+ "%s: write command only possible when the value format is "
+ "a string\n",
+ progname);
+ return (1);
+ }
+
+ /* Run through the values or key/value pairs. */
+ while (*++argv != NULL) {
+ if (!append) {
+ if (rkey) {
+ if (util_str2recno(*argv, &recno))
+ return (1);
+ cursor->set_key(cursor, recno);
+ } else
+ cursor->set_key(cursor, *argv);
+ ++argv;
+ }
+ cursor->set_value(cursor, *argv);
+
+ if ((ret = cursor->insert(cursor)) != 0)
+ return (util_cerr(uri, "insert", ret));
+ }
+
+ return (0);
+}
+
+static int
+usage(void)
+{
+ (void)fprintf(stderr,
+ "usage: %s %s "
+ "write [-ao] uri key ...\n",
+ progname, usage_prefix);
+ return (1);
+}